

            A few performance notes (a settings/model sketch for points 1, 2 and 6 follows this list):
              1. Product image data is stored in pgsql; field type: model.BinaryField (Django 1.6).
              2. Django 1.6 adds persistent database connections via the CONN_MAX_AGE setting, which speeds up access.
              3. Creating every task up front with gevent.spawn() makes memory grow without bound; once the process hits its resource limits (memory overload, file handles, ...) it is killed by the OS.
                 Use gevent.pool.Pool to cap the number of concurrent tasks (a bounded-Pool sketch appears after craw_start below).
              4. In postgresql.conf, raise max_connections to 300 concurrent connections.
              5. When importing data in bulk, drop the table's indexes first to speed up inserts.
              6. Turn off Django's DEBUG mode (leaving it on makes memory grow and never be released).
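
            A minimal sketch of how points 1, 2 and 6 might look in a Django 1.6 project. The setting names (CONN_MAX_AGE, DEBUG, BinaryField) are Django's own; the database credentials and the GoodsImage model are placeholder assumptions, since the post does not show the real settings or image model.

            # settings.py (sketch; NAME and other connection details are placeholders)
            DEBUG = False                  # point 6: stop Django from keeping every query in memory
            DATABASES = {
                'default': {
                    'ENGINE': 'django.db.backends.postgresql_psycopg2',
                    'NAME': 'yixun',       # placeholder database name
                    'CONN_MAX_AGE': 600,   # point 2: reuse connections for up to 10 minutes
                }
            }

            # models.py (sketch for point 1; GoodsImage is a hypothetical model)
            from django.db import models

            class GoodsImage(models.Model):
                goods_link = models.CharField(max_length=512)
                image_data = models.BinaryField()   # raw image bytes stored in PostgreSQL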





            # -*- coding: utf-8 -*-


            '''
                yixun_crawler - product-information crawler for the Yixun shopping site
                author: scott
                date:

                For now, only single-point device login is considered.

                lxml's xpath still gives me some trouble (or maybe I just haven't fully grasped it yet?)

                source-code and db-sample:
                       http://114.215.178.29/static/projects/crawler/
            '''

            import imp
            imp.load_source('init','../init_script.py')
            import gevent
            import os,os.path,sys,struct,time,traceback,signal,threading,copy,base64,datetime

            # from django.db import connection
            from django.db.models import Sum
            from django.db import transaction

            import yixun.models as yixun

            from bson.objectid import ObjectId
            import lxml.etree as etree

            import urllib2,urlparse

            dbfile = 'goods.txt'

            fdbfile = open(dbfile,'wb')

            class ResourceItem:
                def __init__(self,text,href=None,tag=None,parent=None):
                    self.text = text
                    self.tag=tag
                    self.href=href
                    self.children=[]
                    self.parent = parent


            def scrape_page(url,pageIndex,cat1,cat2,cat3):
                print 'scrape_page:',url

                req = urllib2.urlopen(url)
                data = req.read()
                # savefile(data)
                html = etree.HTML(data.decode('utf-8'))

                #page size

                curPage = 0
                r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/b/text()')
                if not r: return False
                curPage = r[0]
                r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/text()')
                if not r : return False
                pageNum = int(r[0][1:])
                print pageNum,curPage

                # in one case, passing a page index greater than the total page count makes the server return the first page again

                if pageIndex > pageNum:
                    return False


                # scan the goods entries on this page
                goods = html.xpath(u"//div[@class='mod_goods']")
                if not goods:
                    print 'skipped..'
                    return False
                for g in goods:
                    for e in g.getchildren():
                        if e.get('class') == 'mod_goods_info':    # this lookup can't be done in a single xpath expression, so walk the children one by one
                            name = ''
                            price =None
                            link = ''
                            for  p in e.getchildren():
                                if p.get('class')=='mod_goods_tit':
                                    a= p.getchildren()[0]
                                    name =  a.text.encode('utf-8')
                                    link = a.get('href')

                                if p.get('class')=='mod_goods_price':
                                    price = p.getchildren()[0].getchildren()[1].text.encode('utf-8')
                            if name and price and link:
                                # print name , price ,link
                                text = "%s || %s || %s || %s || %s || %s\n"%(cat1,cat2,cat3,name,price,link.strip())

                                print text

                                gitem = yixun.GoodsItem()
                                gitem.cat1 = cat1
                                gitem.cat2 = cat2
                                gitem.cat3 = cat3
                                gitem.name = name
                                gitem.cat5 =  link

                                try:
                                    gitem.price = float(price)
                                except:
                                    pass    # price text may not parse as a float; leave the price unset
                                gitem.save()

                                # fdbfile.write(text)
                                # fdbfile.flush()


                return True

                # ss= p.xpath('..//dd/a')

            '''
            http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037
            http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=2&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
            http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=1&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
            '''
            def scrape_cat(cat,yPageId,yPageLevel,tag,cat1,cat2,cat3):
                try:
                    print cat.href
                    #parse url
                    url = cat.href
                    fs =  urlparse.urlparse(url)
                    path,qs=fs[2],fs[4]
                    cat_idx =  path[1:].split('-')[0]
                    # tag = qs.split('=')[1]
                    tag = "%s.%s%s"%(yPageLevel,yPageId,tag)
                    #make path url
                    for page in range(1,500):
                        url = "http://searchex.yixun.com/html?path=%s&area=1&sort=0&show=0&page=%s&size=40&pf=0&as=0&charset=utf-8&YTAG=%s#list"%(cat_idx,page,tag)
                        if not scrape_page(url,page,cat1,cat2,cat3):
                            break

                    return


                except:
                    traceback.print_exc()
                    # print 'page is null,skipped..'

            def savefile(d,filename='sample.html'):
                f = open(filename,'w')
                f.write(d)
                f.close()

            def test():
                try:
                    url = 'http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037'
                    fs =  urlparse.urlparse(url)
                    path,qs=fs[2],fs[4]
                    cat_idx =  path[1:].split('-')[0]
                    tag = qs.split('=')[1]
                    print cat_idx,tag

                    return

                    all_url = 'http://searchex.yixun.com/html?YTAG=3.705766287001&path=705882t705893'
                    req = urllib2.urlopen(all_url)
                    html = req.read()
                    savefile(html)

                    dom = etree.HTML(html.decode('utf-8'))
                    p = dom.xpath(u"//div[@title='品牌']")[0]
                    ss= p.xpath('..//dd/a')
                    print ss[0].text.encode('utf-8')

                except:
                    traceback.print_exc()

            def craw_start():
                import re
                try:
                    all_url = 'http://searchex.yixun.com/?YTAG=2.1738456090000'
                    req = urllib2.urlopen(all_url)
                    html = req.read()

                    # group = re.search("window\.yPageId ='(.*?)'",html)
                    yPageId = re.search("window\.yPageId\s*=\s*'(\d+?)'",html).group(1)
                    yPageLevel = re.search("window\.yPageLevel\s*=\s*'(\d+?)'",html).group(1)
                    print yPageId,yPageLevel

                    dom = etree.HTML(html.decode('gb2312'))
                    all_cats=[]
                    cat1_list = dom.xpath("//div[@class='m_classbox']")
                    for cat in cat1_list:
                        cat1_text = cat.xpath('h3/text()')[0]
                        cat1_e = ResourceItem(cat1_text)
                        all_cats.append(cat1_e)
                        print cat1_e.text.encode('utf-8')
                        div = cat.xpath("div")[0]
                        for dl in  div.xpath('dl'):
                            cat2 = dl.xpath('dt/a')[0]
                            cat2_e = ResourceItem(cat2.text,href=cat2.attrib['href'],tag=cat2.attrib['ytag'],parent=cat1_e)
                            cat1_e.children.append(cat2_e)
                            print ' '*4,cat1_e.text.encode('utf-8'),cat2_e.href,cat2_e.tag
                            for cat3 in dl.xpath('dd/a'):
                                cat3_e = ResourceItem(cat3.text,href=cat3.attrib['href'],tag=cat3.attrib['ytag'],parent=cat2_e)
                                cat2_e.children.append(cat3_e)
                                print ' '*8,cat3_e.text.encode('utf-8'),cat3_e.href,cat3_e.tag
                    tasks =[]
                    for e1 in all_cats:
                        print '-'*1,e1.text.encode('utf-8')
                        for e2 in e1.children:
                            print '  '*2    ,e2.text.encode('utf-8')
                            for e3 in e2.children:
                                print '  '*4,e3.text.encode('utf-8')
                                task = gevent.spawn(scrape_cat,e3,yPageId,yPageLevel,e2.tag,e1.text.encode('utf-8'),e2.text.encode('utf-8'),e3.text.encode('utf-8'))
                                tasks.append(task)
                                # scrape_cat(e3,yPageId,yPageLevel,e2.tag,e1.text.encode('utf-8'),e2.text.encode('utf-8'),e3.text.encode('utf-8'))
                                # return
                    gevent.joinall(tasks)
                except:
                    traceback.print_exc()
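
            Performance note 3 above warns that spawning every category task at once, as craw_start does with gevent.spawn, lets memory and file handles grow until the OS kills the process. Below is a minimal sketch of how the same loop could be bounded with gevent.pool.Pool; the pool size of 20 and the craw_start_pooled helper are assumptions for illustration, not values from the post.

            # sketch: the spawn loop from craw_start, bounded by gevent.pool.Pool
            import gevent.pool

            pool = gevent.pool.Pool(20)     # assumed limit; tune to the machine and to max_connections

            def craw_start_pooled(all_cats, yPageId, yPageLevel):
                tasks = []
                for e1 in all_cats:
                    for e2 in e1.children:
                        for e3 in e2.children:
                            # pool.spawn blocks once 20 greenlets are already running,
                            # so concurrency (and therefore memory) stays bounded
                            task = pool.spawn(scrape_cat, e3, yPageId, yPageLevel, e2.tag,
                                              e1.text.encode('utf-8'),
                                              e2.text.encode('utf-8'),
                                              e3.text.encode('utf-8'))
                            tasks.append(task)
                gevent.joinall(tasks)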


            if __name__ == '__main__':
                craw_start()
                # test()
                pass
            posted on 2014-05-20 11:33 by 放屁阿狗 (17167 reads, 0 comments)