5173.com 為了做數據分析,需要抓取同行交易系統的業務信息,於是把這個需求交給了我;我花了 1 天時間用 Python 完成。
1 # -*- coding:utf-8 -*-
2 #掃描xunbao173.com web頁面記錄到數據庫
3 #zhangbin 2010.5.12 5173.com
4 import sys,os
5
6 import traceback,threading,time,struct,os,os.path,zlib,struct
7 import copy,socket,select
8 #import psycopg2
9 import httplib,re
10
11 import log,config
12
13
14 #function Paging(total) {
15 # this.pageSize = 10;//每頁顯示記錄數
16 # this.step = 5;//最多顯示分頁頁數
17 # this.total = total; //總記錄數
18 #}
19
20 '''
21
22 CREATE DATABASE htmlgrep
23 WITH OWNER = postgres
24 ENCODING = 'UTF8';
25
26
27 CREATE TABLE htmlGrep
28 (
29 id integer,
30 item_name character(60) NOT NULL,
31 price_s character(40),
32 rank integer,
33 appear_time integer NOT NULL,
34 disappear_time integer NOT NULL
35 ) WITH (OIDS=TRUE)
36 ;
37
38 '''
39
# Shared module state used by the scraping functions below.
g_conf = config.SimpleConfig()
g_conf.open('grep.conf')            # properties such as 'root.site' are read from this file
g_logger = log.Logger('hgrep.log')  # receives the tracebacks caught while scanning
g_dbconn = None                     # only referenced by the commented-out database code
g_flog = None                       # replaced by an open file in the script entry point
g_cookie = ''                       # set by scanGameServers(), sent by getPageHtml()
47
48 #def getDBConn():
49 # global g_dbconn
50 # try:
51 # if g_dbconn == None:
52 # dbhost=g_conf.getPropertyValue('dbhost','localhost')
53 # dbname='gamegrep'
54 # dbuser=g_conf.getPropertyValue('dbuser','postgres')
55 # dbpasswd=g_conf.getPropertyValue('dbpasswd','111111')
56 # g_dbconn = psycopg2.connect(host=dbhost,database=dbname,user=dbuser,password=dbpasswd)
57 # except:
58 # g_logger.error(traceback.format_exc())
59 # return g_dbconn
60
61 #檢索頁數量
62
63 '''
64 <input type="hidden" id="currentPage" value="1"/>
65 <input type="hidden" id="orderBy" value=""/>
66 <input type="hidden" id="pageTotal" value="24"/>
67 '''
68
def getPageNum(html):
    """Return the value of the hidden ``pageTotal`` input found in *html*.

    The value is returned exactly as captured from the markup, i.e. as a string
    such as ``'24'`` for ``<input type="hidden" id="pageTotal" value="24"/>``.
    When the page contains no such field, ``'0'`` is returned instead of raising
    ``IndexError``, so the result can always be handed to ``int()``.
    """
    found = re.search(r'id="pageTotal" value="(.*?)"', html, re.S)
    if found is None:
        return '0'
    return found.group(1)
75
def getPageHtml(game,idx):
    """Download page *idx* of the ``buy.gsp`` listing for *game* and return its body.

    The host name comes from the ``root.site`` config property and the request
    carries the cookie kept in the module-level ``g_cookie``.

    The request advertises gzip/deflate support, and httplib hands back the raw
    bytes, so a compressed reply is inflated here before it is returned.
    The connection is always closed, even when the request fails.
    """
    gamesite = g_conf.getPropertyValue('root.site')

    hdr = {
        'Cookie': g_cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip,deflate',
    }

    conn = httplib.HTTPConnection(gamesite)
    try:
        conn.request("GET", "/%s/buy.gsp?keyWord=&groupName=&orderBy=&page=%s"%(game,idx),'',hdr)
        r1 = conn.getresponse()
        html = r1.read()
        encoding = (r1.getheader('content-encoding') or '').lower()
    finally:
        conn.close()

    if encoding in ('gzip', 'deflate'):
        # wbits = MAX_WBITS | 32 lets zlib auto-detect a gzip or zlib header.
        html = zlib.decompress(html, zlib.MAX_WBITS | 32)
    return html
95
96
def scanRecordsOfHtml(f,html,serverid,serverName):
    """Write one comma-separated line to *f* for every item listing found in *html*.

    Each line holds the four fields captured from the markup (the ``realName``
    span text, the rank cell, the price cell and the first argument of
    ``linkTo(...)``) followed by *serverid* and *serverName*.
    Nothing is written when the markup contains no listing.
    """
    pattern = re.compile(
        r'''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)',''',
        re.S)
    for name, rank, price, item_id in pattern.findall(html):
        row = (name, rank, price, item_id, serverid, serverName)
        f.write("%s,%s,%s,%s,%s,%s\n" % row)
105
106
107 #掃描游戲服務器 [{name,url}]
def _http_get(site, path, body=None, headers=None):
    """GET *path* from *site*; return ``(data, set_cookie)`` and always close the connection.

    *data* is the response body, inflated when the server answered with a gzip or
    deflate Content-Encoding. *set_cookie* is the raw Set-Cookie header or None.
    """
    conn = httplib.HTTPConnection(site)
    try:
        conn.request("GET", path, body, headers or {})
        resp = conn.getresponse()
        data = resp.read()
        encoding = (resp.getheader('content-encoding') or '').lower()
        set_cookie = resp.getheader('set-cookie')
    finally:
        conn.close()
    if encoding in ('gzip', 'deflate'):
        # wbits = MAX_WBITS | 32 lets zlib auto-detect a gzip or zlib header.
        data = zlib.decompress(data, zlib.MAX_WBITS | 32)
    return data, set_cookie


# Scan the game servers of one game.
def scanGameServers(game):
    """Scrape the item listings of every server of *game* into one text file per server.

    Steps:
      1. Fetch ``/<game>/getServerList`` and read the (aid, id, title) triples from
         the second ``ser_area_list`` div. The function prints a message and
         returns early when the page does not have exactly two such divs or the
         div lists no server.
      2. Store the first ``name=value`` pair of the Set-Cookie header in the global
         ``g_cookie`` (an empty string when the header is absent) so that
         getPageHtml() sends it as well.
      3. For each server: request its ``getServerList?aid=..&id=..`` URL, read the
         page total from ``buy.gsp``, then fetch every listing page, append the raw
         page to ``g_flog`` and the parsed records to ``<title>.txt``.

    An error while handling one server is logged through ``g_logger`` and the scan
    continues with the next server. The per-server file is always closed.
    """
    global g_cookie
    gamesite = g_conf.getPropertyValue('root.site')
    html, set_cookie = _http_get(gamesite, "/%s/getServerList"%(game))

    ms = re.findall("<div class=\"ser_area_list\">(.*?)</div>",html,re.S)
    if len(ms)!=2:
        print('Html content invalid!')
        return
    # hrefs look like "getServerList?aid=15&id=1136"
    ms = re.findall("<a href=\".*?aid=(.*?)&id=(.*?)\">.*?title=\"(.*?)\".*?</a>",ms[1],re.S)
    if not ms:
        print('Game:%s is null!'%(game))
        return
    print('%s servers Found'%len(ms))

    # A reply without Set-Cookie must not abort the scan.
    cookie = (set_cookie or '').split(';')[0]
    g_cookie = cookie

    hdr = {
        'Cookie': cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip,deflate',
    }
    PAGE_SIZE = 10

    # Enter each game server in turn.
    for server in ms:
        f = None
        try:
            print(server)
            f = open(server[2].decode('utf-8').encode('gb2312')+'.txt','w')
            print("/%s/%s"%(game,server[0]))
            url = "getServerList?aid=%s&id=%s"%(server[:2])
            print(url)

            # The reply is not used; presumably this request selects the server
            # for the cookie's session -- TODO confirm against the site.
            _http_get(gamesite, "/%s/%s"%(game,url), '', hdr)
            print('have a sleep\n')
            time.sleep(.2)

            html, _ = _http_get(gamesite, "/%s/%s"%(game,'buy.gsp'), '', hdr)

            # NOTE(review): this treats pageTotal as a record count (10 per page);
            # when it is an exact multiple of 10 one extra page is requested.
            PAGE_COUNT = int(getPageNum(html))//PAGE_SIZE + 1

            for page in range(1,PAGE_COUNT+1):
                print('attempt to grep Game=%s Page=%s\n'%(game,page))
                html = getPageHtml(game,page)
                g_flog.write(html)
                scanRecordsOfHtml(f,html,server[1],server[2])
        except Exception:
            g_logger.error(traceback.format_exc())
        finally:
            if f is not None:
                f.close()
185
186
187
188 #def scanGameServers2(game):
189 # url = "/%s/buy.gsp"%(game)
190 # gamesite=g_conf.getPropertyValue('root.site')
191 # conn = httplib.HTTPConnection(gamesite)
192 # conn.request("GET", url)
193 # print url
194 # r1 = conn.getresponse()
195 # html = r1.read()
196 # #print html
197 # print html
198 # g_flog.write( html)
199 #
200 #def scanRecords(file):
201 # f = open(file,'r')
202 # html = f.read()
203 # f.close()
204 # regex = '''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)','''
205 # ms = re.findall(regex,html,re.S)
206 # f = open('hgrep.rec.txt','w')
207 # conn = getDBConn()
208 #
209 # for n in ms:
210 # f.write("%s,%s,%s,%s\n"%n)
211 # try:
212 # cr = conn.cursor()
213 # sql = "select count(*) from htmlgrep where id=%s"%(n[3])
214 # cr.execute(sql)
215 #
216 # rs = cr.fetchone()
217 # if rs[0] == 0 :
218 # #if 1:
219 # #cr = conn.cursor()
220 # sql="insert into htmlgrep values(%s,%s,%s,%s,%s,%s);"
221 # cr.execute(sql,( int(n[3]),n[0],n[2],n[1],int(time.time()),0,))
222 # conn.commit()
223 # else:
224 # sql = "update htmlgrep set disappear_time=0 where id=%s"%(int(n[3]))
225 # cr.execute(sql)
226 # conn.commit()
227 #
228 # except:
229 # g_logger.error(traceback.format_exc())
230 # #如果db內的數據不存在當前緩存內則標記為物品消失,并記錄消失時間
231 # cr = conn.cursor()
232 # cr.execute('select id from htmlgrep order by id')
233 # rs = cr.fetchone()
234 # while rs:
235 # found = False
236 # for n in ms:
237 # if int(n[3]) == rs[0]:
238 # found = True
239 # break
240 # if not found:
241 # cr2 = conn.cursor()
242 # sql = "update htmlgrep set disappear_time=%s where id=%s"%(int(time.time()),rs[0])
243 # cr2.execute(sql)
244 # rs = cr.fetchone()
245 # conn.commit()
246 # f.close()
247 # #print str(ms)
248 ##############################################################
249
250
class sepApp:
    """Skeleton application object that owns a configuration and a logger slot."""

    def __init__(self):
        self._conf = config.SimpleConfig()
        # Defined up front so getLogger() never hits a missing attribute;
        # stays None until a logger is assigned.
        self._log = None

    def getConfig(self):
        """Return the SimpleConfig instance created in __init__."""
        return self._conf

    def getLogger(self):
        """Return the logger stored on the instance, or None if none was assigned."""
        return self._log

    def run(self, args=None):
        """Placeholder entry point: ignore *args* and return 0.

        *args* is optional so that both call forms the class used to declare,
        ``run()`` and ``run(args)``, are accepted.
        """
        return 0
283
284
285
286 ##############################################################
287 ##############################################################
288
289 #scanRecords('c:/test - Copy.html')
290
291
292 '''
293 sql test:
294 ---------------------
295 --select count(*) from htmlgrep
296 --select id,count(id) from htmlgrep group by id limit 100
297 --select * from htmlgrep where id = 2310
298 --delete from htmlgrep
299
300 '''
# Script entry point. Nothing runs at import time; the scan starts only when
# the file is executed directly with the 'scan' argument.
if __name__=='__main__':
    if len(sys.argv)<2:
        print('usage: grep.py scan | build')
        sys.exit()
    if sys.argv[1]=='scan':
        # scanGameServers() appends every downloaded page to g_flog.
        g_flog = open('c:/test.txt','w')
        try:
            scanGameServers('mhzx')
        finally:
            g_flog.close()
    # 'build' is advertised in the usage text, but its handler (scanRecords)
    # is commented out in this file.
    #if sys.argv[1]=='build':
    #    scanRecords('c:/test.txt')
    #server = sepApp()
314
315
316
317
318
2 #掃描xunbao173.com web頁面記錄到數據庫
3 #zhangbin 2010.5.12 5173.com
4 import sys,os
5
6 import traceback,threading,time,struct,os,os.path,zlib,struct
7 import copy,socket,select
8 #import psycopg2
9 import httplib,re
10
11 import log,config
12
13
14 #function Paging(total) {
15 # this.pageSize = 10;//每頁顯示記錄數
16 # this.step = 5;//最多顯示分頁頁數
17 # this.total = total; //總記錄數
18 #}
19
20 '''
21
22 CREATE DATABASE htmlgrep
23 WITH OWNER = postgres
24 ENCODING = 'UTF8';
25
26
27 CREATE TABLE htmlGrep
28 (
29 id integer,
30 item_name character(60) NOT NULL,
31 price_s character(40),
32 rank integer,
33 appear_time integer NOT NULL,
34 disappear_time integer NOT NULL
35 ) WITH (OIDS=TRUE)
36 ;
37
38 '''
39
# Shared module state used by the scraping functions below.
g_conf = config.SimpleConfig()
g_conf.open('grep.conf')            # properties such as 'root.site' are read from this file
g_logger = log.Logger('hgrep.log')  # receives the tracebacks caught while scanning
g_dbconn = None                     # only referenced by the commented-out database code
g_flog = None                       # replaced by an open file in the script entry point
g_cookie = ''                       # set by scanGameServers(), sent by getPageHtml()
47
48 #def getDBConn():
49 # global g_dbconn
50 # try:
51 # if g_dbconn == None:
52 # dbhost=g_conf.getPropertyValue('dbhost','localhost')
53 # dbname='gamegrep'
54 # dbuser=g_conf.getPropertyValue('dbuser','postgres')
55 # dbpasswd=g_conf.getPropertyValue('dbpasswd','111111')
56 # g_dbconn = psycopg2.connect(host=dbhost,database=dbname,user=dbuser,password=dbpasswd)
57 # except:
58 # g_logger.error(traceback.format_exc())
59 # return g_dbconn
60
61 #檢索頁數量
62
63 '''
64 <input type="hidden" id="currentPage" value="1"/>
65 <input type="hidden" id="orderBy" value=""/>
66 <input type="hidden" id="pageTotal" value="24"/>
67 '''
68
def getPageNum(html):
    """Return the value of the hidden ``pageTotal`` input found in *html*.

    The value is returned exactly as captured from the markup, i.e. as a string
    such as ``'24'`` for ``<input type="hidden" id="pageTotal" value="24"/>``.
    When the page contains no such field, ``'0'`` is returned instead of raising
    ``IndexError``, so the result can always be handed to ``int()``.
    """
    found = re.search(r'id="pageTotal" value="(.*?)"', html, re.S)
    if found is None:
        return '0'
    return found.group(1)
75
def getPageHtml(game,idx):
    """Download page *idx* of the ``buy.gsp`` listing for *game* and return its body.

    The host name comes from the ``root.site`` config property and the request
    carries the cookie kept in the module-level ``g_cookie``.

    The request advertises gzip/deflate support, and httplib hands back the raw
    bytes, so a compressed reply is inflated here before it is returned.
    The connection is always closed, even when the request fails.
    """
    gamesite = g_conf.getPropertyValue('root.site')

    hdr = {
        'Cookie': g_cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip,deflate',
    }

    conn = httplib.HTTPConnection(gamesite)
    try:
        conn.request("GET", "/%s/buy.gsp?keyWord=&groupName=&orderBy=&page=%s"%(game,idx),'',hdr)
        r1 = conn.getresponse()
        html = r1.read()
        encoding = (r1.getheader('content-encoding') or '').lower()
    finally:
        conn.close()

    if encoding in ('gzip', 'deflate'):
        # wbits = MAX_WBITS | 32 lets zlib auto-detect a gzip or zlib header.
        html = zlib.decompress(html, zlib.MAX_WBITS | 32)
    return html
95
96
def scanRecordsOfHtml(f,html,serverid,serverName):
    """Write one comma-separated line to *f* for every item listing found in *html*.

    Each line holds the four fields captured from the markup (the ``realName``
    span text, the rank cell, the price cell and the first argument of
    ``linkTo(...)``) followed by *serverid* and *serverName*.
    Nothing is written when the markup contains no listing.
    """
    pattern = re.compile(
        r'''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)',''',
        re.S)
    for name, rank, price, item_id in pattern.findall(html):
        row = (name, rank, price, item_id, serverid, serverName)
        f.write("%s,%s,%s,%s,%s,%s\n" % row)
105
106
107 #掃描游戲服務器 [{name,url}]
def _http_get(site, path, body=None, headers=None):
    """GET *path* from *site*; return ``(data, set_cookie)`` and always close the connection.

    *data* is the response body, inflated when the server answered with a gzip or
    deflate Content-Encoding. *set_cookie* is the raw Set-Cookie header or None.
    """
    conn = httplib.HTTPConnection(site)
    try:
        conn.request("GET", path, body, headers or {})
        resp = conn.getresponse()
        data = resp.read()
        encoding = (resp.getheader('content-encoding') or '').lower()
        set_cookie = resp.getheader('set-cookie')
    finally:
        conn.close()
    if encoding in ('gzip', 'deflate'):
        # wbits = MAX_WBITS | 32 lets zlib auto-detect a gzip or zlib header.
        data = zlib.decompress(data, zlib.MAX_WBITS | 32)
    return data, set_cookie


# Scan the game servers of one game.
def scanGameServers(game):
    """Scrape the item listings of every server of *game* into one text file per server.

    Steps:
      1. Fetch ``/<game>/getServerList`` and read the (aid, id, title) triples from
         the second ``ser_area_list`` div. The function prints a message and
         returns early when the page does not have exactly two such divs or the
         div lists no server.
      2. Store the first ``name=value`` pair of the Set-Cookie header in the global
         ``g_cookie`` (an empty string when the header is absent) so that
         getPageHtml() sends it as well.
      3. For each server: request its ``getServerList?aid=..&id=..`` URL, read the
         page total from ``buy.gsp``, then fetch every listing page, append the raw
         page to ``g_flog`` and the parsed records to ``<title>.txt``.

    An error while handling one server is logged through ``g_logger`` and the scan
    continues with the next server. The per-server file is always closed.
    """
    global g_cookie
    gamesite = g_conf.getPropertyValue('root.site')
    html, set_cookie = _http_get(gamesite, "/%s/getServerList"%(game))

    ms = re.findall("<div class=\"ser_area_list\">(.*?)</div>",html,re.S)
    if len(ms)!=2:
        print('Html content invalid!')
        return
    # hrefs look like "getServerList?aid=15&id=1136"
    ms = re.findall("<a href=\".*?aid=(.*?)&id=(.*?)\">.*?title=\"(.*?)\".*?</a>",ms[1],re.S)
    if not ms:
        print('Game:%s is null!'%(game))
        return
    print('%s servers Found'%len(ms))

    # A reply without Set-Cookie must not abort the scan.
    cookie = (set_cookie or '').split(';')[0]
    g_cookie = cookie

    hdr = {
        'Cookie': cookie,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip,deflate',
    }
    PAGE_SIZE = 10

    # Enter each game server in turn.
    for server in ms:
        f = None
        try:
            print(server)
            f = open(server[2].decode('utf-8').encode('gb2312')+'.txt','w')
            print("/%s/%s"%(game,server[0]))
            url = "getServerList?aid=%s&id=%s"%(server[:2])
            print(url)

            # The reply is not used; presumably this request selects the server
            # for the cookie's session -- TODO confirm against the site.
            _http_get(gamesite, "/%s/%s"%(game,url), '', hdr)
            print('have a sleep\n')
            time.sleep(.2)

            html, _ = _http_get(gamesite, "/%s/%s"%(game,'buy.gsp'), '', hdr)

            # NOTE(review): this treats pageTotal as a record count (10 per page);
            # when it is an exact multiple of 10 one extra page is requested.
            PAGE_COUNT = int(getPageNum(html))//PAGE_SIZE + 1

            for page in range(1,PAGE_COUNT+1):
                print('attempt to grep Game=%s Page=%s\n'%(game,page))
                html = getPageHtml(game,page)
                g_flog.write(html)
                scanRecordsOfHtml(f,html,server[1],server[2])
        except Exception:
            g_logger.error(traceback.format_exc())
        finally:
            if f is not None:
                f.close()
185
186
187
188 #def scanGameServers2(game):
189 # url = "/%s/buy.gsp"%(game)
190 # gamesite=g_conf.getPropertyValue('root.site')
191 # conn = httplib.HTTPConnection(gamesite)
192 # conn.request("GET", url)
193 # print url
194 # r1 = conn.getresponse()
195 # html = r1.read()
196 # #print html
197 # print html
198 # g_flog.write( html)
199 #
200 #def scanRecords(file):
201 # f = open(file,'r')
202 # html = f.read()
203 # f.close()
204 # regex = '''<span class="realName">(.*?)</span>.*?rank.*?>(.*?)</dd>.*?price.*?>(.*?)</dd>.*?linkTo\('(.*?)','''
205 # ms = re.findall(regex,html,re.S)
206 # f = open('hgrep.rec.txt','w')
207 # conn = getDBConn()
208 #
209 # for n in ms:
210 # f.write("%s,%s,%s,%s\n"%n)
211 # try:
212 # cr = conn.cursor()
213 # sql = "select count(*) from htmlgrep where id=%s"%(n[3])
214 # cr.execute(sql)
215 #
216 # rs = cr.fetchone()
217 # if rs[0] == 0 :
218 # #if 1:
219 # #cr = conn.cursor()
220 # sql="insert into htmlgrep values(%s,%s,%s,%s,%s,%s);"
221 # cr.execute(sql,( int(n[3]),n[0],n[2],n[1],int(time.time()),0,))
222 # conn.commit()
223 # else:
224 # sql = "update htmlgrep set disappear_time=0 where id=%s"%(int(n[3]))
225 # cr.execute(sql)
226 # conn.commit()
227 #
228 # except:
229 # g_logger.error(traceback.format_exc())
230 # #如果db內的數據不存在當前緩存內則標記為物品消失,并記錄消失時間
231 # cr = conn.cursor()
232 # cr.execute('select id from htmlgrep order by id')
233 # rs = cr.fetchone()
234 # while rs:
235 # found = False
236 # for n in ms:
237 # if int(n[3]) == rs[0]:
238 # found = True
239 # break
240 # if not found:
241 # cr2 = conn.cursor()
242 # sql = "update htmlgrep set disappear_time=%s where id=%s"%(int(time.time()),rs[0])
243 # cr2.execute(sql)
244 # rs = cr.fetchone()
245 # conn.commit()
246 # f.close()
247 # #print str(ms)
248 ##############################################################
249
250
class sepApp:
    """Skeleton application object that owns a configuration and a logger slot."""

    def __init__(self):
        self._conf = config.SimpleConfig()
        # Defined up front so getLogger() never hits a missing attribute;
        # stays None until a logger is assigned.
        self._log = None

    def getConfig(self):
        """Return the SimpleConfig instance created in __init__."""
        return self._conf

    def getLogger(self):
        """Return the logger stored on the instance, or None if none was assigned."""
        return self._log

    def run(self, args=None):
        """Placeholder entry point: ignore *args* and return 0.

        *args* is optional so that both call forms the class used to declare,
        ``run()`` and ``run(args)``, are accepted.
        """
        return 0
283
284
285
286 ##############################################################
287 ##############################################################
288
289 #scanRecords('c:/test - Copy.html')
290
291
292 '''
293 sql test:
294 ---------------------
295 --select count(*) from htmlgrep
296 --select id,count(id) from htmlgrep group by id limit 100
297 --select * from htmlgrep where id = 2310
298 --delete from htmlgrep
299
300 '''
# Script entry point. Nothing runs at import time; the scan starts only when
# the file is executed directly with the 'scan' argument.
if __name__=='__main__':
    if len(sys.argv)<2:
        print('usage: grep.py scan | build')
        sys.exit()
    if sys.argv[1]=='scan':
        # scanGameServers() appends every downloaded page to g_flog.
        g_flog = open('c:/test.txt','w')
        try:
            scanGameServers('mhzx')
        finally:
            g_flog.close()
    # 'build' is advertised in the usage text, but its handler (scanRecords)
    # is commented out in this file.
    #if sys.argv[1]=='build':
    #    scanRecords('c:/test.txt')
    #server = sepApp()
314
315
316
317
318