
            A few performance notes:
              1. Product image data is stored in pgsql, field type: models.BinaryField (Django 1.6).
              2. Django 1.6 adds persistent database connections; the CONN_MAX_AGE setting speeds up access (see the settings sketch after this list).
              3. Spawning every task with gevent.spawn() gets the process killed outright: memory balloons, and once the resources involved hit their ceiling the OS kills it (memory overload, file handles, ...).
                 Use gevent.pool.Pool to cap the number of concurrent tasks (see the gevent sketch after this list).
              4. In postgresql.conf, raise the max_connections parameter to 300 concurrent connections.
              5. When importing data, drop the table's indexes first to speed up inserts.
              6. Turn off Django's DEBUG mode (with it on, memory leaks and is never released; also covered in the settings sketch below).
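
            A minimal settings.py sketch for items 2 and 6 above (the ENGINE and database name are placeholders for whatever the project actually uses; CONN_MAX_AGE is available from Django 1.6 on):

                DEBUG = False    # with DEBUG=True Django records every query, so memory only grows

                DATABASES = {
                    'default': {
                        'ENGINE': 'django.db.backends.postgresql_psycopg2',
                        'NAME': 'yixun',            # placeholder database name
                        'CONN_MAX_AGE': 600,        # keep connections alive ~10 minutes instead of reconnecting per request
                    }
                }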
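
            A small sketch of item 3, capping concurrency with gevent.pool.Pool instead of an unbounded gevent.spawn() per task; the pool size of 20 and the fetch() worker are placeholders, and in the crawler below the worker would be scrape_cat():

                import gevent.pool

                def fetch(url):
                    print url                       # stand-in worker; the real crawler would call scrape_cat()

                urls = ['http://searchex.yixun.com/?page=%d' % i for i in range(1, 6)]   # placeholder URLs
                pool = gevent.pool.Pool(20)         # at most 20 greenlets run at the same time
                for u in urls:
                    pool.spawn(fetch, u)
                pool.join()                         # wait for every queued greenlet to finish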





            # -*- coding: utf-8 -*-


            '''
                yixun_crawler - product information collector for the yixun.com site
                author: scott
                date:

                For now only single-device login is considered.

                lxml's xpath still has a few problems (or maybe I just haven't fully grasped it?)

                source-code and db-sample:
                       http://114.215.178.29/static/projects/crawler/
            '''

            import imp
            imp.load_source('init','../init_script.py')
            import gevent
            import os,os.path,sys,struct,time,traceback,signal,threading,copy,base64,datetime

            # from django.db import connection
            from django.db.models import Sum
            from django.db import transaction

            import yixun.models as  yixun

            from bson.objectid import ObjectId
            import lxml.etree as etree

            import urllib2,urlparse

            dbfile = 'goods.txt'

            fdbfile = open(dbfile,'wb')

            class ResourceItem:
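                # One node in the scraped category tree: caption text, link href, ytag, and parent/children pointers.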
                def __init__(self,text,href=None,tag=None,parent=None):
                    self.text = text
                    self.tag=tag
                    self.href=href
                    self.children=[]
                    self.parent = parent


            def scrape_page(url,pageIndex,cat1,cat2,cat3):
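                # Fetch one listing page of a leaf category, pull each goods block out of the HTML with
                # lxml xpath, and save name/price/link as a yixun.GoodsItem row.
                # Returns False when the requested page index is past the last page, so the caller stops paging.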
                print 'scrape_page:',url

                req = urllib2.urlopen(url)
                data = req.read()
                # savefile(data)
                html = etree.HTML(data.decode('utf-8'))

                #page size

                curPage = 0
                r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/b/text()')
                if not r: return False
                curPage = r[0]
                r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/text()')
                if not r : return False
                pageNum = int(r[0][1:])
                print pageNum,curPage

                #Edge case: if a page index larger than the total page count is passed in, the server returns the first page again

                if pageIndex > pageNum:
                    return False


                #find the goods blocks on this page
                goods = html.xpath(u"//div[@class='mod_goods']")
                if not goods:
                    print 'skipped..'
                    return False
                for g in goods:
                    for e in g.getchildren():
                        if e.get('class') == 'mod_goods_info':    #the search below can't be expressed as a single xpath, so walk the children one by one
                            name = ''
                            price =None
                            link = ''
                            for  p in e.getchildren():
                                if p.get('class')=='mod_goods_tit':
                                    a= p.getchildren()[0]
                                    name =  a.text.encode('utf-8')
                                    link = a.get('href')

                                if p.get('class')=='mod_goods_price':
                                    price = p.getchildren()[0].getchildren()[1].text.encode('utf-8')
                            if name and price and link:
                                # print name , price ,link
                                text = "%s || %s || %s || %s || %s || %s\n"%(cat1,cat2,cat3,name,price,link.strip())

                                print text

                                gitem = yixun.GoodsItem()
                                gitem.cat1 = cat1
                                gitem.cat2 = cat2
                                gitem.cat3 = cat3
                                gitem.name = name
                                gitem.cat5 =  link

                                try:
                                    gitem.price = float(price)
                                except:
                                    pass
                                gitem.save()

                                # fdbfile.write(text)
                                # fdbfile.flush()


                return True

                # ss= p.xpath('..//dd/a')

            '''
            http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037
            http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=2&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
            http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=1&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
            '''
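            # Crawl every page of one leaf category: rebuild the paged search URL from the category path
            # and the YTAG value, then call scrape_page() for page 1, 2, ... until it reports the last page.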
            def scrape_cat(cat,yPageId,yPageLevel,tag,cat1,cat2,cat3):
                try:
                    print cat.href
                    #parse url
                    url = cat.href
                    fs =  urlparse.urlparse(url)
                    path,qs=fs[2],fs[4]
                    cat_idx =  path[1:].split('-')[0]
                    # tag = qs.split('=')[1]
                    tag = "%s.%s%s"%(yPageLevel,yPageId,tag)
                    #make path url
                    for page in range(1,500):
                        url = "http://searchex.yixun.com/html?path=%s&area=1&sort=0&show=0&page=%s&size=40&pf=0&as=0&charset=utf-8&YTAG=%s#list"%(cat_idx,page,tag)
                        if not scrape_page(url,page,cat1,cat2,cat3):
                            break

                    return


                except:
                    traceback.print_exc()
                    # print 'page is null,skipped..'

            def savefile(d,filename='sample.html'):
                f = open(filename,'w')
                f.write(d)
                f.close()

            def test():
                try:
                    url = 'http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037'
                    fs =  urlparse.urlparse(url)
                    path,qs=fs[2],fs[4]
                    cat_idx =  path[1:].split('-')[0]
                    tag = qs.split('=')[1]
                    print cat_idx,tag

                    return

                    all_url = 'http://searchex.yixun.com/html?YTAG=3.705766287001&path=705882t705893'
                    req = urllib2.urlopen(all_url)
                    html = req.read()
                    savefile(html)

                    dom = etree.HTML(html.decode('utf-8'))
                    p = dom.xpath(u"//div[@title='品牌']")[0]
                    ss= p.xpath('..//dd/a')
                    print ss[0].text.encode('utf-8')

                except:
                    traceback.print_exc()

            def craw_start():
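                # Entry point: fetch the category home page, read yPageId/yPageLevel out of the inline
                # javascript, walk the three-level category tree, then spawn one greenlet per leaf category.
                # (Per performance note 3 above, a gevent.pool.Pool would bound the concurrency here.)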
                import re
                try:
                    all_url = 'http://searchex.yixun.com/?YTAG=2.1738456090000'
                    req = urllib2.urlopen(all_url)
                    html = req.read()

                    # group = re.search("window\.yPageId ='(.*?)'",html)
                    yPageId = re.search("window\.yPageId\s*=\s*'(\d+?)'",html).group(1)
                    yPageLevel = re.search("window\.yPageLevel\s*=\s*'(\d+?)'",html).group(1)
                    print yPageId,yPageLevel

                    dom = etree.HTML(html.decode('gb2312'))
                    all_cats=[]
                    cat1_list = dom.xpath("//div[@class='m_classbox']")
                    for cat in cat1_list:
                        cat1_text = cat.xpath('h3/text()')[0]
                        cat1_e = ResourceItem(cat1_text)
                        all_cats.append(cat1_e)
                        print cat1_e.text.encode('utf-8')
                        div = cat.xpath("div")[0]
                        for dl in  div.xpath('dl'):
                            cat2 = dl.xpath('dt/a')[0]
                            cat2_e = ResourceItem(cat2.text,href=cat2.attrib['href'],tag=cat2.attrib['ytag'],parent=cat1_e)
                            cat1_e.children.append(cat2_e)
                            print ' '*4,cat1_e.text.encode('utf-8'),cat2_e.href,cat2_e.tag
                            for cat3 in dl.xpath('dd/a'):
                                cat3_e = ResourceItem(cat3.text,href=cat3.attrib['href'],tag=cat3.attrib['ytag'],parent=cat2_e)
                                cat2_e.children.append(cat3_e)
                                print ' '*8,cat3_e.text.encode('utf-8'),cat3_e.href,cat3_e.tag
                    tasks =[]
                    for e1 in all_cats:
                        print '-'*1,e1.text.encode('utf-8')
                        for e2 in e1.children:
                            print '  '*2    ,e2.text.encode('utf-8')
                            for e3 in e2.children:
                                print '  '*4,e3.text.encode('utf-8')
                                task = gevent.spawn(scrape_cat,e3,yPageId,yPageLevel,e2.tag,e1.text.encode('utf-8'),e2.text.encode('utf-8'),e3.text.encode('utf-8'))
                                tasks.append(task)
                                # scrape_cat(e3,yPageId,yPageLevel,e2.tag,e1.text.encode('utf-8'),e2.text.encode('utf-8'),e3.text.encode('utf-8'))
                                # return
                    gevent.joinall(tasks)
                except:
                    traceback.print_exc()


            if __name__ == '__main__':
                craw_start()
                # test()
                pass
            posted on 2014-05-20 11:33 by 放屁阿狗, 17167 reads, 0 comments