锘??xml version="1.0" encoding="utf-8" standalone="yes"?>精品久久国产一区二区三区香蕉 ,久久WWW免费人成一看片,久久亚洲高清观看http://www.shnenglu.com/jrckkyy/archive/2010/03/15/109755.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Mon, 15 Mar 2010 11:24:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2010/03/15/109755.htmlhttp://www.shnenglu.com/jrckkyy/comments/109755.htmlhttp://www.shnenglu.com/jrckkyy/archive/2010/03/15/109755.html#Feedback0http://www.shnenglu.com/jrckkyy/comments/commentRss/109755.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/109755.html鍓嶄竴闃佃姳浜嗙偣鏃墮棿瀛︿範python錛岃繎孌墊椂闂村畬鎴愪簡涓涓洃鎺ф湇鍔″櫒鍩烘湰淇℃伅鐨勯」鐩紝閮芥槸涓轟簡婊¤凍澶у鐩戞帶鐨勬鏈涳紝鐗規畩鏃ュ織騫舵姤璀︾殑鍒嗗竷寮忕郴緇燂紝鍗曞彴鏈嶅姟鍣ㄩ噰闆嗙矑搴︿負1嬈?1鍒嗛挓錛屼竴澶╁ぇ綰?440鏉★紝鐩墠鐩戞帶浜?0澶氬彴鏈嶅姟鍣紝涓澶╁ぇ綰?1680鏉℃棩蹇楋紝鐜板湪鍗曠偣鐩戞帶涓績鏈嶅姟鍣ㄥ湪鎬ц兘涓婅繕緇扮話鏈変綑錛屾湁鏇村鐨勬湇鍔″櫒鏉ユ祴璇曞氨濂戒簡錛屼及璁″彲浠ユ敮鎸佸埌100鍙頒互涓婃湇鍔″櫒鐩戞帶鐨勭駭鍒?/p>

现在遇到一个需求是发现报警时实时发送消息给相关人员，由于公司短信网关只买了上海电信用户没有上海电信的号码，汗一个，只好通过发邮件来实施。

鏀寔鍙戦丟B18030緙栫爜鐨勬枃鏈唴瀹癸紝浠繪剰緙栫爜闄勪歡錛屽彲浠ュ仛鍑洪傚綋淇敼鏀寔緹ゅ彂銆?/p>

 

·········10········20········30········40········50········60········70········80········90········100·······110·······120·······130·······140·······150
  1. #coding=utf-8   
  2. #!/usr/lib/python2.5/bin/python   
  3. import os   
  4. import sys   
  5. from smtplib import SMTP   
  6. from email.MIMEMultipart import MIMEMultipart   
  7. from email.mime.application import MIMEApplication   
  8. from email.MIMEText import MIMEText   
  9. from email.MIMEBase import MIMEBase   
  10. from email import Utils,Encoders   
  11. import mimetypes   
  12. import time   
  13.   
  14. STMP_SERVER = "mail.×××.com"  
  15. STMP_PORT = "25"  
  16. USERNAME = "×××@×××.com"  
  17. USERPASSWORD = "×××"  
  18. FROM = "MonitorCenterWarning@×××.com"  
  19. TO = "×××@gmail.com"  
  20.   
  21. def sendFildByMail(config):   
  22.     print 'Preparing...'  
  23.     message = MIMEMultipart( )   
  24.     message['from'] = config['from']   
  25.     message['to'] = config['to']   
  26.     message['Reply-To'] = config['from']   
  27.     message['Subject'] = config['subject']   
  28.     message['Date'] = time.ctime(time.time())   
  29.     message['X-Priority'] =  '3'  
  30.     message['X-MSMail-Priority'] =  'Normal'  
  31.     message['X-Mailer'] =  'Microsoft Outlook Express 6.00.2900.2180'  
  32.     message['X-MimeOLE'] =  'Produced By Microsoft MimeOLE V6.00.2900.2180'  
  33.        
  34.     if 'file' in config:   
  35.         # add the attachment   
  36.         f=open(config['file'], 'rb')   
  37.         file = MIMEApplication(f.read())   
  38.         f.close()   
  39.         file.add_header('Content-Disposition', 'attachment', filename= os.path.basename(config['file']))   
  40.         message.attach(file)   
  41.        
  42.     if 'content' in config:   
  43.         # add the text body   
  44.         f=open(config['content'], 'rb')   
  45.         f.seek(0)   
  46.         content = f.read()   
  47.         body = MIMEText(content, 'base64', 'gb2312')   
  48.         message.attach(body)   
  49.   
  50.     print 'OKay'  
  51.     print 'Logging...'  
  52.     smtp = SMTP(config['server'], config['port'])   
  53.     # if the SMTP server does not require a login to send mail, comment out the next line   
  54.     smtp.login(config['username'], config['password'])   
  55.     print 'OK'  
  56.        
  57.     print 'Sending...',   
  58.     smtp.sendmail (config['from'], [config['from'], config['to']], message.as_string())   
  59.     print 'OK'  
  60.     smtp.close()   
  61.     time.sleep(1)   
  62.   
  63. if __name__ == "__main__":   
  64.     if len(sys.argv) < 2:   
  65.         print 'Usage: python %s contentfilename' % os.path.basename(sys.argv[0])   
  66.         print 'OR Usage: python %s contentfilename attachfilename' % os.path.basename(sys.argv[0])   
  67.         wait=raw_input("quit.")   
  68.         sys.exit(-1)   
  69.     elif len(sys.argv) == 2:   
  70.         sendFildByMail({   
  71.             'from': FROM,   
  72.             'to': TO,   
  73.             'subject': '[MonitorCenter]Send Msg %s' % sys.argv[1],   
  74.             'content': sys.argv[1],   
  75.             'server': STMP_SERVER,   
  76.             'port': STMP_PORT,   
  77.             'username': USERNAME,   
  78.             'password': USERPASSWORD})   
  79.     elif len(sys.argv) == 3:   
  80.         sendFildByMail({   
  81.             'from': FROM,   
  82.             'to': TO,   
  83.             'subject': '[MonitorCenter]Send Msg and File %s %s' % (sys.argv[1], sys.argv[2]),   
  84.             'content': sys.argv[1],   
  85.             'file': sys.argv[2],   
  86.             'server': STMP_SERVER,   
  87.             'port': STMP_PORT,   
  88.             'username': USERNAME,   
  89.             'password': USERPASSWORD})   
  90.     wait=raw_input("end.")  

 

windows xp下：

渚嬪瓙

 linux ubuntu，suse下：

1

收到的结果：

2



]]>
[鍒嗗竷寮忚法騫沖彴鐩戞帶緋葷粺]linux涓嬬洃鎺х綉緇滄祦閲忓拰緗戦?python鑴氭湰搴旂敤http://www.shnenglu.com/jrckkyy/archive/2010/03/15/109754.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Mon, 15 Mar 2010 11:22:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2010/03/15/109754.htmlhttp://www.shnenglu.com/jrckkyy/comments/109754.htmlhttp://www.shnenglu.com/jrckkyy/archive/2010/03/15/109754.html#Feedback0http://www.shnenglu.com/jrckkyy/comments/commentRss/109754.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/109754.html鐢變簬涓婅瘉鎵錛屾繁浜ゆ墍level1錛宭evel2閲戣瀺鏁版嵁鏈嶅姟鍣ㄥ湪涓婂崍9錛?0寮濮嬪埌11錛?0鍜屼笅鍗?3錛?0寮濮嬪埌15錛?0涓鍏卞ぇ綰?涓皬鏃剁殑鏃墮棿鍐呮祦閲忔瘮杈冨ぇ鎵浠ヨ鐩戞帶鏈嶅姟鍣ㄧ殑緗戠粶嫻侀熺畻鏄竴涓鐩戞帶鐨勯噸瑕佹寚鏍囥傚彲浠ラ氳繃绱姞涓孌墊椂闂村唴鍚勪釜緗戝崱鐨勪笂琛岋紝涓嬭嫻侀噺闄や互榪欎釜鏃墮棿闂撮殧璁$畻鍑鴻繖孌墊椂闂村唴鐨勫鉤鍧囩綉閫燂紝鎴戠幇鍦ㄧ殑閲囬泦棰戠巼鏄?鍒嗛挓閲囬泦涓嬈★紝鍦ㄥ疄闄呭紑鐩樻湡闂磋繍琛岃繃紼嬩腑寰楀埌鐨勭綉閫熺洃鎺т俊鎭敤榪樻槸姣旇緝鍑嗙‘鐨勶紝閮戒繚鎸佸湪5M/S宸﹀彸鐨勯熷害錛屾湁鏃跺欏湪騫蟲椂闈炴湇鍔℃湡鐪嬭鏌愬彴鏈嶅姟鍣ㄧ殑鍐呯綉緗戝崱緗戦熻揪鍒?M/S 錛屾灉鐒跺氨鏄湁浜哄湪澶ф墜絎斾紶杈撱?/p>

鐙珛鐨勭洃鎺ц剼鏈槸榪斿洖涓涓垪琛ㄥ祵濂楀厓緇勭殑鏁版嵁緇撴瀯錛屾渶鍚庡啀姹囨繪垚涓涓畬鏁寸殑XML鏁版嵁宀涳紝涓轟簡璋冭瘯鏂逛究鑴氭湰鐨勬瘡涓涓腑闂寸粨鏋滈兘瀵煎嚭鍒頒竴涓復鏃舵枃鏈腑銆?/p>

榪愯浠ヤ笅鑴氭湰瑕佺‘瀹氫綘鐨刲inux瑁呬簡ethtool宸ュ叿錛屽湪ubuntu2.6.27-7-server錛寀buntu22.6.27.19-5-default錛宻use 2.6.27.19-5-default 嫻嬭瘯閫氳繃銆?/p>

浠g爜錛?/p>

 

  1. #coding=utf-8   
  2. #!/usr/bin/python   
  3. import re   
  4. import os   
  5. import time   
  6.   
  7. import utils   
  8. def sortedDictValues3(adict):   
  9.     keys = adict.keys()   
  10.     keys.sort()   
  11.     return map(adict.get, keys)   
  12.   
  13. def run():   
  14.     if utils.isLinux() == False:   
  15.         return [('ifconfig_collect os type error','this is windows')]   
  16.     #not first run   
  17.     if os.path.isfile('./oldifconfig'):   
  18.         fileold = open('./oldifconfig', 'r')   
  19.         fileold.seek(0)   
  20.         # read the traffic snapshot and timestamp saved by the previous run   
  21.         (oldtime, fileoldcontent) = fileold.read().split('#')   
  22.         fileold.close;   
  23.         netcard = {}   
  24.         tempstr = ''  
  25.         key = ''  
  26.         for strline in fileoldcontent.split('\n'):   
  27.             reobj = re.compile('^lo*.')   
  28.             if reobj.search(strline):   
  29.                 break;   
  30.             reobj = re.compile('^eth*.')   
  31.             if reobj.search(strline):   
  32.                 key = strline.split()[0]   
  33.             tempstr = tempstr + strline + '\n'  
  34.             netcard[key] = tempstr   
  35.         RXold = {}   
  36.         TXold = {}   
  37.         for key,value in netcard.items():   
  38.             tempsplit = value.split('\n')   
  39.             netcard[key] = ''  
  40.             for item in tempsplit:   
  41.                 item = item + '<br>'  
  42.                 netcard[key] = netcard[key] + item   
  43.                 tempcount = 1  
  44.                 for match in re.finditer("(bytes:)(.*?)( \()", item):   
  45.                     if tempcount == 1:   
  46.                         RXold[key] = match.group(2)   
  47.                         tempcount = tempcount + 1  
  48.                     elif tempcount == 2:   
  49.                         TXold[key] = match.group(2)   
  50.                         netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'  
  51.            
  52.         # save the current NIC information to a temporary file   
  53.         os.system('ifconfig > ifconfigtemp')   
  54.         file = open('./ifconfigtemp','r');   
  55.         fileold = open('./oldifconfig', 'w')   
  56.         temptimestr = str(int(time.time()));   
  57.         fileold.write(temptimestr)   
  58.         fileold.write('#')   
  59.         file.seek(0)   
  60.         fileold.write(file.read())   
  61.         fileold.close()   
  62.         returnkeys = []   
  63.         returnvalues = []   
  64.         netcard = {}   
  65.         tempcountcard = 0  
  66.         file.seek(0)   
  67.         key = ''  
  68.         for strline in file.readlines():   
  69.             reobj = re.compile('^lo*.')   
  70.             if reobj.search(strline):   
  71.                 break;   
  72.             reobj = re.compile('^eth*.')   
  73.             if reobj.search(strline):   
  74.                 key = strline.split()[0]   
  75.                 netcard[key] = ''  
  76.             netcard[key] = netcard[key] + strline   
  77.         newnetcard = {}   
  78.         file.seek(0)   
  79.         key = ''  
  80.         for strline in file.readlines():   
  81.             reobj = re.compile('^lo*.')   
  82.             if reobj.search(strline):   
  83.                 break;   
  84.             if re.search("^eth", strline):   
  85.                 templist = strline.split()   
  86.                 key = templist[0]   
  87.                 newnetcard[key] = ''  
  88.                 newnetcard[key] = templist[4] + newnetcard[key] + ' '  
  89.             if re.search("^ *inet ", strline):   
  90.                 templist = strline.split()   
  91.                 newnetcard[key] = templist[1][5:] + ' ' + newnetcard[key] + ' '  
  92.         for key,value in newnetcard.items():   
  93.             # dump each NIC's link status (working or not) to a temporary file   
  94.             os.system('ethtool %s > ethtooltemp'%(key))   
  95.             file = open('./ethtooltemp','r');   
  96.             tempethtooltemplist = file.read().split('\n\t')   
  97.             file.close   
  98.             if re.search("yes", tempethtooltemplist[-1]):   
  99.                 templist = newnetcard[key].split()   
  100.                 newnetcard[key] = templist[0] + ' runing! ' + templist[1]   
  101.             else:   
  102.                 templist = newnetcard[key].split()   
  103.                 if len(templist) > 1:   
  104.                     newnetcard[key] = templist[0] + ' stop! ' + templist[1]   
  105.                 else:   
  106.                     newnetcard[key] =  'stop! ' + templist[0]   
  107.         file.close()   
  108.         RX = {}   
  109.         TX = {}   
  110.         for key,value in netcard.items():   
  111.             tempsplit = value.split('\n')   
  112.             netcard[key] = ''  
  113.             for item in tempsplit:   
  114.                 item = item + '<br>'  
  115.                 netcard[key] = netcard[key] + item   
  116.                 tempcount = 1  
  117.                 for match in re.finditer("(bytes:)(.*?)( \()", item):   
  118.                     if tempcount == 1:   
  119.                         RX[key] = str(int(match.group(2)) - int(RXold[key]))   
  120.                         tempcount = tempcount + 1  
  121.                     elif tempcount == 2:   
  122.                         TX[key] = str(int(match.group(2)) - int(TXold[key]))   
  123.                         divtime = float(int(time.time()) - int(oldtime))   
  124.                         if divtime == 0:   
  125.                             rate = (float(TX[key]) + float(RX[key]))   
  126.                         else:   
  127.                             rate = (float(TX[key]) + float(RX[key]))/(divtime)   
  128.                         if rate == 0:   
  129.                             newnetcard[key] = '0' + ' ' + newnetcard[key]   
  130.                         else:   
  131.                             newnetcard[key] = '%.2f'%rate + ' ' + newnetcard[key]   
  132.         return zip(['order'], ['48']) + newnetcard.items();   
  133.     else:   
  134.         os.system('ifconfig > ifconfigtemp')   
  135.         file = open('./ifconfigtemp','r');   
  136.         fileold = open('./oldifconfig', 'w')   
  137.         temptimestr = str(int(time.time()));   
  138.         fileold.write(temptimestr)   
  139.         fileold.write('#')   
  140.         file.seek(0)   
  141.         fileold.write(file.read())   
  142.         fileold.close()   
  143.   
  144.         netcard = {}   
  145.         file.seek(0)   
  146.         key = ''  
  147.         for strline in file.readlines():   
  148.             reobj = re.compile('^lo*.')   
  149.             if reobj.search(strline):   
  150.                 break;   
  151.             reobj = re.compile('^eth*.')   
  152.             if reobj.search(strline):   
  153.                 key = strline.split()[0]   
  154.                 netcard[key] = ''  
  155.             netcard[key] = netcard[key] + strline   
  156.         RX = {}   
  157.         TX = {}   
  158.            
  159.         key = ''  
  160.         newnetcard = {}   
  161.         file.seek(0)   
  162.         for strline in file.readlines():   
  163.             reobj = re.compile('^lo*.')   
  164.             if reobj.search(strline):   
  165.                 break;   
  166.             if re.search("^eth", strline):   
  167.                 templist = strline.split()   
  168.                 key = templist[0]   
  169.                 newnetcard[key] = templist[4] + ' '  
  170.             if re.search("^ *inet ", strline):   
  171.                 templist = strline.split()   
  172.                 newnetcard[key] = newnetcard[key] + templist[1][5:] + ' '  
  173.         for key,value in newnetcard.items():   
  174.             os.system('ethtool %s > ethtooltemp'%(key))   
  175.             file = open('./ethtooltemp','r');   
  176.             tempethtooltemplist = file.read().split('\n')   
  177.             file.close   
  178.             if re.search("yes", tempethtooltemplist[-1]):   
  179.                 newnetcard[key] = newnetcard[key] + 'runing!'  
  180.             else:   
  181.                 newnetcard[key] = newnetcard[key] + 'stop!'  
  182.         file.close()   
  183.         for key,value in netcard.items():   
  184.             tempsplit = value.split('\n')   
  185.             netcard[key] = ''  
  186.             for item in tempsplit:   
  187.                 item = item + '<br>'  
  188.                 #print item   
  189.                 netcard[key] = netcard[key] + item   
  190.                 tempcount = 1  
  191.                 for match in re.finditer("(bytes:)(.*?)( \()", item):   
  192.                     if tempcount == 1:   
  193.                         RX[key] = match.group(2)   
  194.                         tempcount = tempcount + 1  
  195.                     elif tempcount == 2:   
  196.                         TX[key] = match.group(2)   
  197.                         netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'  
  198.                         newnetcard[key] = newnetcard[key] + ' ' + '0 <br>'  
  199.         return zip(['order'], ['48']) + newnetcard.items();   
  200. if __name__ == '__main__':   
  201.     print run()  

 

浣跨敤渚嬪瓙錛?/p>

1 

姣忎竴涓垪琛ㄥ厓绱犲厓緇勯噷闈㈢浜屼釜鍏冪礌絎竴涓瓧孌典負緗戦?Bytes/S錛屼緥濡俥th1緗戝崱鐨勭綉閫熷氨鏄?.3KB/s錛宔th0緗戦熸槸2.9KB/s錛屼粖澶╂槸鍛ㄥ叚榪欎釜嫻侀噺寰堟甯?/p>

]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(4)http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102949.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:03:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102949.htmlhttp://www.shnenglu.com/jrckkyy/comments/102949.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102949.html#Feedback3http://www.shnenglu.com/jrckkyy/comments/commentRss/102949.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102949.html浠ヤ笅鏄牴鎹鍚戠儲寮曞緩绔嬪掓帓绱㈠紩鐨勬敞閲?/p>

 

int main(int argc, char* argv[])    //./CrtInvertedIdx moon.fidx.sort > sun.iidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo) 
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine,strDocNum,tmp1="";
    int cnt = 0;
    while (getline(ifsImgInfo, strLine)) 
    {
        string::size_type idx;
        string tmp;


        idx = strLine.find("\t");
        tmp = strLine.substr(0,idx);

        if (tmp.size()<2 || tmp.size() > 8) continue;

        if (tmp1.empty()) tmp1=tmp;

        if (tmp == tmp1) 
        {
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }
        else 
        {
            if ( strDocNum.empty() )
                strDocNum = strDocNum + " " + strLine.substr(idx+1);

            cout << tmp1 << "\t" << strDocNum << endl;
            tmp1 = tmp;
            strDocNum.clear();
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }

        cnt++;
        //if (cnt==100) break;
    }
    cout << tmp1 << "\t" << strDocNum << endl;  //鍊掓帓绱㈠紩涓瘡涓瓧鍏稿崟璇嶅悗鐨勬枃妗g紪鍙蜂互table閿負闂撮殧

    return 0;
}

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(3) http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102948.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:02:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102948.htmlhttp://www.shnenglu.com/jrckkyy/comments/102948.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102948.html#Feedback1http://www.shnenglu.com/jrckkyy/comments/commentRss/102948.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102948.html榪欓噷浠嬬粛姝e悜绱㈠紩鐨勫緩绔嬶紝濡傛灉鐩存帴寤虹珛鍊掓帓绱㈠紩鏁堢巼涓婂彲鑳戒細寰堜綆錛屾墍浠ュ彲浠ュ厛浜х敓姝e悜绱㈠紩涓哄悗闈㈢殑鍊掓帓绱㈠紩鎵撲笅鍩虹銆?/p>

 

璇︾粏鐨勬枃浠跺姛鑳藉拰浠嬬粛閮藉湪榪欓噷鏈変簡浠嬬粛鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫5]鍊掓帓绱㈠紩鐨勫緩绔嬪強鏂囦歡浠嬬粛

 

CrtForwardIdx.cpp鏂囦歡

 

int main(int argc, char* argv[])    //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo) 
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine,strDocNum;
    int cnt = 0;
    while (getline(ifsImgInfo, strLine)) 
    {
        string::size_type idx;

        cnt++;
        if (cnt%2 == 1) //濂囨暟琛屼負鏂囨。緙栧彿
        {
            strDocNum = strLine.substr(0,strLine.size());
            continue;
        }
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            continue;
        }

        while ( (idx = strLine.find(SEPARATOR)) != string::npos ) //鎸囧畾鏌ユ壘鍒嗙晫絎?
        {
            string tmp1 = strLine.substr(0,idx);
            cout << tmp1 << "\t" << strDocNum << endl;
            strLine = strLine.substr(idx + SEPARATOR.size());
        }

        //if (cnt==100) break;
    }

    return 0;
}

 

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(2)http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102947.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:02:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102947.htmlhttp://www.shnenglu.com/jrckkyy/comments/102947.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102947.html#Feedback1http://www.shnenglu.com/jrckkyy/comments/commentRss/102947.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102947.html鍓嶉潰鐨凞ocIndex紼嬪簭杈撳叆涓涓猅ianwang.raw.*****鏂囦歡錛屼細浜х敓涓涓嬩笁涓枃浠?Doc.idx, Url.idx, DocId2Url.idx錛屾垜浠繖閲屽DocSegment紼嬪簭榪涜鍒嗘瀽銆?/p>

榪欓噷杈撳叆 Tianwang.raw.*****錛孌oc.idx錛孶rl.idx.sort_uniq絳変笁涓枃浠訛紝杈撳嚭涓涓猅ianwang.raw.***.seg 鍒嗚瘝瀹屾瘯鐨勬枃浠?/p>

int main(int argc, char* argv[])
{
    string strLine, strFileName=argv[1];
    CUrl iUrl;
    vector<CUrl> vecCUrl;
    CDocument iDocument;
    vector<CDocument> vecCDocument;
    unsigned int docId = 0;

    //ifstream ifs("Tianwang.raw.2559638448");
    ifstream ifs(strFileName.c_str());  //DocSegment Tianwang.raw.****
    if (!ifs) 
    {
        cerr << "Cannot open tianwang.img.info for input\n";
        return -1;
    }

    ifstream ifsUrl("Url.idx.sort_uniq");   //鎺掑簭騫舵秷閲嶅悗鐨剈rl瀛楀吀
    if (!ifsUrl) 
    {
        cerr << "Cannot open Url.idx.sort_uniq for input\n";
        return -1;
    }
    ifstream ifsDoc("Doc.idx"); //瀛楀吀鏂囦歡
    if (!ifsDoc) 
    {
        cerr << "Cannot open Doc.idx for input\n";
        return -1;
    }

    while (getline(ifsUrl,strLine)) //鍋忕url瀛楀吀瀛樺叆涓涓悜閲忓唴瀛樹腑
    {
        char chksum[33];
        int  docid;

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%s%d", chksum, &docid );
        iUrl.m_sChecksum = chksum;
        iUrl.m_nDocId = docid;
        vecCUrl.push_back(iUrl);
    }

    while (getline(ifsDoc,strLine))     //鍋忕瀛楀吀鏂囦歡灝嗗叾鏀懼叆涓涓悜閲忓唴瀛樹腑
    {
        int docid,pos,length;
        char chksum[33];

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
        iDocument.m_nDocId = docid;
        iDocument.m_nPos = pos;
        iDocument.m_nLength = length;
        iDocument.m_sChecksum = chksum;
        vecCDocument.push_back(iDocument);
    }

 

    strFileName += ".seg";
    ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);    //璁劇疆瀹屾垚鍒嗚瘝鍚庣殑鏁版嵁杈撳嚭鏂囦歡
    for ( docId=0; docId<MAX_DOC_ID; docId++ )
    {

        // find document according to docId
        int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
        char *pContent = new char[length+1];
        memset(pContent, 0, length+1);
        ifs.seekg(vecCDocument[docId].m_nPos);
        ifs.read(pContent, length);

        char *s;
        s = pContent;

        // skip Head
        int bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;


        // skip header
        bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        //iDocument.m_sBody = s;
        iDocument.RemoveTags(s);    //鍘婚櫎<>
        iDocument.m_sBodyNoTags = s;

        delete[] pContent;
        string strLine = iDocument.m_sBodyNoTags;

        CStrFun::ReplaceStr(strLine, " ", " ");
        CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "


        // segment the document 鍏蜂綋鍒嗚瘝澶勭悊
        CHzSeg iHzSeg;
        strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
        fout << docId << endl << strLine;
        fout << endl;
        
    }

    return(0);
}
榪欓噷鍙槸嫻厜鎺犲獎寮忕殑榪囦竴閬嶅ぇ姒傜殑浠g爜錛屽悗闈㈡垜浼氭湁涓撻璇︾粏璁茶В parse html 鍜?segment docment 絳夋妧鏈?/p>

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫6]鍊掓帓绱㈠紩鐨勫緩绔嬬殑紼嬪簭鍒嗘瀽(1)http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102945.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 15:00:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102945.htmlhttp://www.shnenglu.com/jrckkyy/comments/102945.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102945.html#Feedback1http://www.shnenglu.com/jrckkyy/comments/commentRss/102945.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102945.htmlauthor:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

涓婁竴綃囦富瑕佷粙緇嶄簡鍊掓帓绱㈠紩寤虹珛鐩稿叧鐨勬枃浠跺強涓棿鏂囦歡銆?br>TSE寤虹珛绱㈠紩鍦ㄨ繍琛岀▼搴忎笂鐨勫ぇ鑷存楠ゅ彲浠ョ畝鍖栧垎涓轟互涓嬪嚑姝ワ細

1銆佽繍琛屽懡浠?./DocIndex
浼氱敤鍒頒竴涓枃浠?tianwang.raw.520    //鐖彇鍥炴潵鐨勫師濮嬫枃浠訛紝鍖呭惈澶氫釜緗戦〉鐨勬墍鏈変俊鎭紝鎵浠ュ緢澶э紝榪欎篃鏄竴涓湁寰呰В鍐崇殑闂錛屽埌搴曞瓨鎴愬ぇ鏂囦歡錛堝鏋滆繃澶т細瓚呰繃2G鎴?G鐨勯檺鍒訛紝鑰屼笖鏂囦歡榪囧ぇ绱㈠紩鏁堢巼榪囦綆錛夎繕鏄皬鏂囦歡錛堟枃浠舵暟榪囧鐢ㄤ簬鎵撳紑鍏抽棴鏂囦歡鍙ユ焺鐨勬秷鑰楄繃澶э級榪樻湁寰呮濊冿紝榪樺氨鏄瓨鍌ㄦ柟妗堢殑瑙e喅鏈緇堣偗瀹氭槸瑕佸瓨涓哄垎甯冨紡鐨勶紝鏈緇堟繪枃浠墮噺鑲畾鏄細涓奣B鐨勶紝TSE鍙敮鎸佸皬鍨嬬殑鎼滅儲寮曟搸闇姹傘?nbsp;         
浼氫駭鐢熶竴涓嬩笁涓枃浠?Doc.idx, Url.idx, DocId2Url.idx    //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx

2銆佽繍琛屽懡浠?sort Url.idx|uniq > Url.idx.sort_uniq    //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq
浼氱敤鍒頒竴涓枃浠?Url.idx鏂囦歡 //md5 hash 涔嬪悗鐨剈rl瀹屾暣鍦板潃鍜宒ocument id鍊煎
浼氫駭鐢熶竴涓枃浠?Url.idx.sort_uniq //URL娑堥噸錛宮d5 hash鎺掑簭錛屾彁楂樻绱㈡晥鐜?/p>

3銆佽繍琛屽懡浠?./DocSegment Tianwang.raw.2559638448 
浼氱敤鍒頒竴涓枃浠?Tianwang.raw.2559638448  //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶達紝鍒嗚瘝涓哄悗闈㈠緩绔嬪埌鎺掔儲寮曞仛鍑嗗
浼氫駭鐢熶竴涓枃浠?Tianwang.raw.2559638448.seg //鍒嗚瘝鏂囦歡錛岀敱涓琛宒ocument id鍙峰拰涓琛屾枃妗e垎璇嶇粍錛堝彧瀵規瘡涓枃妗?lt;html></html>涓?lt;head></head><body></body>絳夋枃瀛楁爣璁頒腑鐨勬枃鏈繘琛屽垎緇勶級鏋勬垚

4銆佽繍琛屽懡浠?./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //寤虹珛鐙珛鐨勬鍚戠儲寮?/p>

5銆佽繍琛屽懡浠?br>#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6銆佽繍琛屽懡浠?./CrtInvertedIdx moon.fidx.sort > sun.iidx //寤虹珛鍊掓帓绱㈠紩

鎴戜滑鍏堜粠寤虹珛绱㈠紩鐨勭涓涓▼搴廌ocIndex.cpp寮濮嬪垎鏋愩?娉ㄩ噴綰﹀畾錛歍ianwang.raw.2559638448鏄姄鍥炴潵鍚堝茍鎴愮殑澶ф枃浠訛紝鍚庨潰灝卞彨澶ф枃浠訛紝閲岄潰鍖呭惈浜嗗緢澶氱瘒html鏂囨。錛岄噷闈㈢殑鏂囨。鏈夎寰嬬殑鍒嗛殧灝卞彨鍋氫竴綃囦竴綃囩殑鏂囨。)


//DocIndex.h start-------------------------------------------------------------

 


// Shared constants (file names and limits) for the TSE indexing and search
// programs.
// NOTE(review): the operands of the #include directives below were lost when
// the page was extracted (the <...> parts were stripped as HTML tags); they
// must be restored from the original Comm.h before this header can compile.
#ifndef _COMM_H_040708_
#define _COMM_H_040708_

#include

#include
#include
#include
#include
#include
#include
#include


using namespace std;

// Byte cap used by DocSegment when scanning for the end of a header block.
const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results the front end returns per page

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312;  // needed by DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;


//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// sample inverted-index entry: <term>  14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
// sample inverted-index entry: <term>  1085 1222

// 90,000+ entries; term file including special symbols, punctuation and Chinese characters
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file (the original comment said "inverted index file", apparently a copy-paste slip)
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0";  // needed by DocIndex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0";  // needed by Snapshot.cpp


//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");


#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------

#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
 //ifstream ifs("Tianwang.raw.3023555472");
 //iceway(mnsc)
 ifstream ifs(DOC_FILE_NAME.c_str()); //鎵撳紑Tianwang.raw.3023555472鏂囦歡錛屾渶鍘熷鐨勬枃浠?br> if (!ifs)
 {
     cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
     return -1;
    }
 ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮Url.idx鏂囦歡
 if( !ofsUrl )
 {
  cout << "error open file " << endl;
 }

 ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮Doc.idx鏂囦歡
 if( !ofsDoc )
 {
  cout << "error open file " << endl;
 }

 ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮DocId2Url.idx鏂囦歡
 if( !ofsDocId2Url )
 {
  cout << "error open file " << endl;
 }

 int cnt=0; //鏂囨。緙栧彿浠?寮濮嬭綆?br> string strLine,strPage;
 CUrl iUrl;
 CDocument iDocument;
 CMD5 iMD5;
 
 int nOffset = ifs.tellg();
 while (getline(ifs, strLine))
 {
  if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
  {
   nOffset = ifs.tellg();
   continue;
  }

  if (!strncmp(strLine.c_str(), "version: 1.0", 12)) //鍒ゆ柇絎竴琛屾槸鍚︽槸version: 1.0濡傛灉鏄氨瑙f瀽涓嬪幓
  { 
   if(!getline(ifs, strLine)) break;
   if (!strncmp(strLine.c_str(), "url: ", 4)) //鍒ゆ柇絎簩琛屾槸鍚︽槸url: 濡傛灉鏄垯瑙f瀽涓嬪幓
   {
    iUrl.m_sUrl = strLine.substr(5); //鎴彇url: 浜斾釜瀛楃涔嬪悗鐨剈rl鍐呭
    iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); //瀵箄rl鐢╩d5 hash澶勭悊
    iUrl.m_sChecksum = iMD5.ToString(); //灝嗗瓧絎︽暟緇勭粍鍚堟垚瀛楃涓茶繖涓嚱鏁板湪Md5.h涓疄鐜?/p>

   } else
   {
    continue;
   }

   while (getline(ifs, strLine))
   {
    if (!strncmp(strLine.c_str(), "length: ", 8)) //涓鐩磋涓嬪幓鐩村埌鍒ゆ柇婢規竟(鐩稿絎簲琛?鎯烘瑺琚瘋帒ength: 鏄垯鎺ヤ笅涓嬪幓
    {
     sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); //灝嗚鍧楁墍浠h〃緗戦〉鐨勫疄闄呯綉欏靛唴瀹歸暱搴︽斁鍏Document鏁版嵁緇撴瀯涓?br>     break;
    }
   }

   getline(ifs, strLine); //璺寵繃鐩稿絎叚琛屾晠鎰忕暀鐨勪竴涓┖琛?/p>

   iDocument.m_nDocId = cnt; //灝嗘枃妗g紪鍙瘋祴鍊煎埌iDocument鏁版嵁緇撴瀯涓?br>   iDocument.m_nPos = nOffset; //鏂囨。緇撳熬鍦ㄥぇ鏂囦歡涓殑緇撴潫琛屽彿
   char *pContent = new char[iDocument.m_nLength+1]; //鏂板緩璇ユ枃妗i暱搴︾殑瀛楃涓叉寚閽?/p>

   memset(pContent, 0, iDocument.m_nLength+1); //姣忎竴浣嶅垵濮嬪寲涓?
   ifs.read(pContent, iDocument.m_nLength); //鏍規嵁鑾峰緱鐨勬枃妗i暱搴﹁鍙栨竟(鍏朵腑鍖呭惈鍗忚澶?璇誨彇鏂囨。鍐呭
   iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
   iDocument.m_sChecksum = iMD5.ToString(); //灝嗗瓧絎︽暟緇勭粍鍚堟垚瀛楃涓茶繖涓嚱鏁板湪Md5.h涓疄鐜?br>   
   delete[] pContent;
   
   ofsUrl << iUrl.m_sChecksum ; //灝唌d5hash鍚庣殑url鍐欏叆Url.idx鏂囦歡
   ofsUrl << "\t" << iDocument.m_nDocId << endl; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢鏂囦歡緙栧彿鍐欏叆Url.idx鏂囦歡

   ofsDoc << iDocument.m_nDocId ; //灝嗘枃浠剁紪鍙峰啓鍏oc.idx鏂囦歡
   ofsDoc << "\t" << iDocument.m_nPos ; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢璇ユ枃妗g粨鏉熻鍙鋒竟(鍚屾牱涔熸槸涓嬩竴鏂囨。寮濮嬭鍙?鍐欏叆Doc.idx鏂囦歡
   //ofsDoc << "\t" << iDocument.m_nLength ;
   ofsDoc << "\t" << iDocument.m_sChecksum << endl; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢md5hash鍚庣殑url鍐欏叆Doc.idx鏂囦歡

   ofsDocId2Url << iDocument.m_nDocId ; //灝嗘枃浠剁紪鍙峰啓鍏ocId2Url.idx鏂囦歡
   ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; //灝嗚鏂囨。鐨勫畬鏁磚rl鍐欏叆DocId2Url.idx鏂囦歡

   cnt++; //鏂囨。緙栧彿鍔犱竴璇存槑璇ヤ互鏂囨。鍒嗘瀽瀹屾瘯錛岀敓鎴愪笅涓鏂囨。鐨勭紪鍙?br>  }

  nOffset = ifs.tellg();

 }

 //鏈鍚庝竴琛屽彧鏈夋枃妗e彿鍜屼笂涓綃囨枃妗g粨鏉熷彿
 ofsDoc << cnt ;
 ofsDoc << "\t" << nOffset << endl;


 return(0);
}

//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫5]鍊掓帓绱㈠紩鐨勫緩绔嬪強鏂囦歡浠嬬粛http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102943.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:55:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102943.htmlhttp://www.shnenglu.com/jrckkyy/comments/102943.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102943.html#Feedback1http://www.shnenglu.com/jrckkyy/comments/commentRss/102943.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102943.html涓嶅ソ鎰忔濊澶у涔呯瓑浜嗭紝鍓嶄竴闃典竴鐩村湪蹇欒冭瘯錛岀粓浜庣粨鏉熶簡銆傚懙鍛碉紒搴熻瘽涓嶅璇翠簡涓嬮潰鎴戜滑寮濮嬪惂錛?/p>

TSE鐢ㄧ殑鏄皢鎶撳彇鍥炴潵鐨勭綉欏墊枃妗e叏閮ㄨ鍏ヤ竴涓ぇ鏂囨。錛岃鍚庡榪欎竴涓ぇ鏂囨。鍐呯殑鏁版嵁鏁翠綋緇熶竴鐨勫緩绱㈠紩錛屽叾涓寘鍚簡鍑犱釜姝ラ銆?/p>

view plaincopy to clipboardprint?
1.  The document index (Doc.idx) keeps information about each document.  
 
It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.  
 
The information stored in each entry includes a pointer into the repository,  
 
a document length, a document checksum.  
 
 
 
//Doc.idx  鏂囨。緙栧彿 鏂囨。闀垮害    checksum hash鐮?nbsp; 
 
0   0   bc9ce846d7987c4534f53d423380ba70  
 
1   76760   4f47a3cad91f7d35f4bb6b2a638420e5  
 
2   141624  d019433008538f65329ae8e39b86026c  
 
3   142350  5705b8f58110f9ad61b1321c52605795  
 
//Doc.idx   end  
 
 
 
  The url index (url.idx) is used to convert URLs into docIDs.  
 
 
 
//url.idx  
 
5c36868a9c5117eadbda747cbdb0725f    0 
 
3272e136dd90263ee306a835c6c70d77    1 
 
6b8601bb3bb9ab80f868d549b5c5a5f3    2 
 
3f9eba99fa788954b5ff7f35a5db6e1f    3 
 
//url.idx   end  
 
 
 
It is a list of URL checksums with their corresponding docIDs and is sorted by  
 
checksum. In order to find the docID of a particular URL, the URL's checksum  
 
is computed and a binary search is performed on the checksums file to find its  
 
docID.  
 
 
 
    ./DocIndex  
 
        got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?nbsp; 
 
 
 
//DocId2Url.idx  
 
0   http://*.*.edu.cn/index.aspx  
 
1   http://*.*.edu.cn/showcontent1.jsp?NewsID=118  
 
2   http://*.*.edu.cn/0102.html  
 
3   http://*.*.edu.cn/0103.html  
 
//DocId2Url.idx end  
 
 
 
2.  sort Url.idx|uniq > Url.idx.sort_uniq    //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq  
 
 
 
//Url.idx.sort_uniq  
 
//瀵筯ash鍊艱繘琛屾帓搴?nbsp; 
 
000bfdfd8b2dedd926b58ba00d40986b    1111 
 
000c7e34b653b5135a2361c6818e48dc    1831 
 
0019d12f438eec910a06a606f570fde8    366 
 
0033f7c005ec776f67f496cd8bc4ae0d    2103 
 
 
 
3. Segment document to terms, (with finding document according to the url)  
 
    ./DocSegment Tianwang.raw.2559638448        //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?nbsp; 
 
        got Tianwang.raw.2559638448.seg       
 
 
 
//Tianwang.raw.2559638448   鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑  
 
version: 1.0 
 
url: http://***.105.138.175/Default2.asp?lang=gb  
 
origin: http://***.105.138.175/  
 
date: Fri, 23 May 2008 20:01:36 GMT  
 
ip: 162.105.138.175 
 
length: 38413 
 
 
 
HTTP/1.1 200 OK  
 
Server: Microsoft-IIS/5.0 
 
Date: Fri, 23 May 2008 11:17:49 GMT  
 
Connection: keep-alive  
 
Connection: Keep-Alive  
 
Content-Length: 38088 
 
Content-Type: text/html; Charset=gb2312  
 
Expires: Fri, 23 May 2008 11:17:49 GMT  
 
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/  
 
Cache-control: private 
 
 
 
 
 
 
 
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 
 
" 
<html>  
 
<head>  
 
<title>Apabi鏁板瓧璧勬簮騫沖彴</title>  
 
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">  
 
<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">  
 
<META NAME="DESCRIPTION" CONTENT="鏁板瓧鍥句功棣?鏂規鏁板瓧鍥句功棣?鐢靛瓙鍥句功 鐢靛瓙涔?ebook e涔?Apabi 鏁板瓧璧勬簮騫沖彴">  
 
<link rel="stylesheet" type="text/css" href="css\common.css">  
 
 
 
<style type="text/css">  
 
<!--  
 
.style4 {color: #666666}  
 
-->  
 
</style>  
 
 
 
<script LANGUAGE="vbscript">  
 
...  
 
</script>  
 
 
 
<Script Language="javascript">  
 
...  
 
</Script>  
 
</head>  
 
<body leftmargin="0" topmargin="0">  
 
</body>  
 
</html>  
 
//Tianwang.raw.2559638448   end  
 
 
 
//Tianwang.raw.2559638448.seg   灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?娉ㄦ剰涓棿娌℃湁鍥炶濺浣滀負鍒嗛殧)  
 

 
...  
 
...  
 
...  
 

 
...  
 
...  
 
...  
 
//Tianwang.raw.2559638448.seg   end  
 
 
 
//涓嬫槸 Tiny search 闈炲繀欏誨洜绱?nbsp; 
 
4. Create forward index (docid-->termid)     //建立正向索引  
 
    ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx  
 
 
 
//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?lt;BR>//鍒嗚瘝   DocID<BR>1<BR>涓夋槦/  s/  鎵嬫満/  璁哄潧/  ,/  鎵嬫満/  閾冨0/  涓嬭澆/  ,/  鎵嬫満/  鍥劇墖/  涓嬭澆/  ,/  鎵嬫満/<BR>2<BR>...<BR>...<BR>... 

1.  The document index (Doc.idx) keeps information about each document.

It is a fixed width ISAM (Index sequential access mode) index, ordered by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

 

//Doc.idx  鏂囨。緙栧彿 鏂囨。闀垮害 checksum hash鐮?/p>

0 0 bc9ce846d7987c4534f53d423380ba70

1 76760 4f47a3cad91f7d35f4bb6b2a638420e5

2 141624 d019433008538f65329ae8e39b86026c

3 142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx end

 

  The url index (url.idx) is used to convert URLs into docIDs.

 

//url.idx

5c36868a9c5117eadbda747cbdb0725f 0

3272e136dd90263ee306a835c6c70d77 1

6b8601bb3bb9ab80f868d549b5c5a5f3 2

3f9eba99fa788954b5ff7f35a5db6e1f 3

//url.idx end

 

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.

 

 ./DocIndex

  got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?/p>

 

//DocId2Url.idx

http://*.*.edu.cn/index.aspx

http://*.*.edu.cn/showcontent1.jsp?NewsID=118

http://*.*.edu.cn/0102.html

http://*.*.edu.cn/0103.html

//DocId2Url.idx end

 

2.  sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq

 

//Url.idx.sort_uniq

//瀵筯ash鍊艱繘琛屾帓搴?/p>

000bfdfd8b2dedd926b58ba00d40986b 1111

000c7e34b653b5135a2361c6818e48dc 1831

0019d12f438eec910a06a606f570fde8 366

0033f7c005ec776f67f496cd8bc4ae0d 2103

 

3. Segment document to terms, (with finding document according to the url)

 ./DocSegment Tianwang.raw.2559638448  //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?/p>

  got Tianwang.raw.2559638448.seg  

 

//Tianwang.raw.2559638448 鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

 

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

 

 

 

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

"

<html>

<head>

<title>Apabi鏁板瓧璧勬簮騫沖彴</title>

<meta http-equiv="Content-Type" content="text/html; charset=gb2312">

<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">

<META NAME="DESCRIPTION" CONTENT="鏁板瓧鍥句功棣?鏂規鏁板瓧鍥句功棣?鐢靛瓙鍥句功 鐢靛瓙涔?ebook e涔?Apabi 鏁板瓧璧勬簮騫沖彴">

<link rel="stylesheet" type="text/css" href="css\common.css">

 

<style type="text/css">

<!--

.style4 {color: #666666}

-->

</style>

 

<script LANGUAGE="vbscript">

...

</script>

 

<Script Language="javascript">

...

</Script>

</head>

<body leftmargin="0" topmargin="0">

</body>

</html>

//Tianwang.raw.2559638448 end

 

//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?娉ㄦ剰涓棿娌℃湁鍥炶濺浣滀負鍒嗛殧)

1

...

...

...

2

...

...

...

//Tianwang.raw.2559638448.seg end

 

//涓嬫槸 Tiny search 闈炲繀欏誨洜绱?/p>

4. Create forward index (docid-->termid)  //建立正向索引

 ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

 

//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?/鍒嗚瘝   DocID1涓夋槦/  s/  鎵嬫満/  璁哄潧/  ,/  鎵嬫満/  閾冨0/  涓嬭澆/  ,/  鎵嬫満/  鍥劇墖/  涓嬭澆/  ,/  鎵嬫満/2.........view plaincopy to clipboardprint?
//Tianwang.raw.2559638448.seg end  
 
 
//moon.fidx  
 
//姣忕瘒鏂囨。鍙峰搴旀枃妗e唴鍒嗗嚭鏉ョ殑    鍒嗚瘝  DocID  
 
閮戒細  2391 
 
浣?nbsp;  2391 
 
閭d簺  2391 
 
鎷ユ湁  2391 
 
瀹?nbsp;  2391 
 
鐨?nbsp;  2391 
 
浜?nbsp;  2391 
 
鐨?nbsp;  2391 
 
瑙嗛噹  2391 
 
鍙?nbsp;  2391 
 
紿?nbsp;  2391 
 
鍦?nbsp;  2180 
 
鐮旂┒鐢熼儴    2180 
 
涓婚〉  2180 
 
鍩瑰吇  2180 
 
綆$悊  2180 
 
鏍忕洰  2180 
 
涓嬭澆  2180 
 
錛?nbsp;  2180 
 
銆?nbsp;  2180 
 
鍏充簬  2180 
 
鍋氬ソ  2180 
 
騫?nbsp;  2180 
 
鍥藉  2180 
 
鍏淳  2180 
 
鐮旂┒鐢?2180 
 
欏圭洰  2180 
 
//moon.fidx end  
 
 
 
5.# set | grep "LANG" 
 
LANG=en; export LANG;  
 
sort moon.fidx > moon.fidx.sort  
 
 
 
6. Create inverted index (termid-->docid)    //建立倒排索引  
 
    ./CrtInvertedIdx moon.fidx.sort > sun.iidx  
 
 
 
//sun.iidx  //文件规模大概减少1/2  
 
鑺卞伐   236 
 
鑺辨搗   2103 
 
鑺卞崏   1018 1061 1061 1061 1730 1730 1730 1730 1730 1852 949 949 
 
鑺辮暰   447 447 
 
鑺辨湪   1061 
 
鑺卞憿   1430 
 
鑺辨湡   447 447 447 447 447 525 
 
鑺遍挶   174 236 
 
鑺辮壊   1730 1730 
 
鑺辮壊鍝佺     1660 
 
鑺辯敓   450 526 
 
鑺卞紡   1428 1430 1430 1430 
 
鑺辯汗   1430 1430 
 
鑺卞簭   447 447 447 447 447 450 
 
鑺辯誕   136 137 
 
鑺辮娊   450 450 
 
//sun.iidx  end  
 
 
 
TSESearch   CGI program for query  
 
Snapshot    CGI program for page snapshot  
 
 
<P>  
author:http://hi.baidu.com/jrckkyy  
 
author:http://blog.csdn.net/jrckkyy  
</P> 

 



]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫4]灝忕粨http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102942.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:54:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102942.htmlhttp://www.shnenglu.com/jrckkyy/comments/102942.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102942.html#Feedback0http://www.shnenglu.com/jrckkyy/comments/commentRss/102942.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102942.html閫氳繃鍓嶉潰鐨勪笁綃囨枃绔犵浉淇′綘宸茬粡瀵圭縐樼殑鎼滅儲寮曟搸鏈変簡涓涓劅鎬х殑璁よ瘑錛屽拰鏅氱殑php綾諱技鐨勮剼鏈璦鏈嶅姟鍣ㄧ被浼鹼紝閫氳繃鑾峰彇鍓嶅彴鍏抽敭瀛楋紝閫氳繃瀛楀吀鍒嗚瘝錛屽拰浜嬪厛寤虹珛寤虹珛濂界殑鍊掓帓绱㈠紩榪涜鐩稿叧鎬у垎鏋愶紝寰楀嚭鏌ヨ緇撴瀯鏍煎紡鍖栬緭鍑虹粨鏋溿傝岃繖閲岀殑鎶鏈毦鐐瑰湪浜?/p>

1、字典的选取（事实上根据不同时代不同地方人们的语言习惯是不一样的，所以说字典的最小元的取值是不同的）

2、倒排索引的建立（这里就要涉及到爬虫的抓取和索引的建立，后面将重点介绍这2点，搜索引擎的效率和服务质量实效性瓶颈在这里）

3、相关性分析（对抓回来的文档分词建索引和用户关键字分词算法上要对应）

后面文章会重点介绍爬虫的抓取和索引的建立。

]]>
鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫3]鏉ュ埌鍏抽敭瀛楀垎璇嶅強鐩稿叧鎬у垎鏋愮▼搴?http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102941.html瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/dc:creator>瀛﹁呯珯鍦ㄥ法浜虹殑鑲╄唨涓?/author>Thu, 10 Dec 2009 14:53:00 GMThttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102941.htmlhttp://www.shnenglu.com/jrckkyy/comments/102941.htmlhttp://www.shnenglu.com/jrckkyy/archive/2009/12/10/102941.html#Feedback0http://www.shnenglu.com/jrckkyy/comments/commentRss/102941.htmlhttp://www.shnenglu.com/jrckkyy/services/trackbacks/102941.html鏈夊墠闈㈡敞閲婃垜浠彲浠ョ煡閬撴煡璇㈠叧閿瓧鍜屽瓧鍏告枃浠跺噯澶囧ソ濂藉悗錛屽皢榪涘叆鐢ㄦ埛鍏抽敭瀛楀垎璇嶉樁孌?/p>

//TSESearch.cpp涓細

view plaincopy to clipboardprint?
CHzSeg iHzSeg;      //include ChSeg/HzSeg.h  
 
//  
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);  //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴?        鐖?      浣犱滑/ 鐨?      鏍煎紡"  
 
vector<STRING></STRING> vecTerm;  
iQuery.ParseQuery(vecTerm);     //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑  
 
set<STRING></STRING> setRelevantRst;   
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);   
 
gettimeofday(&end_tv,&tz);  
// search end  
//鎼滅儲瀹屾瘯 

 CHzSeg iHzSeg;  //include ChSeg/HzSeg.h

 //
 iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴?  鐖?  浣犱滑/ 鐨?  鏍煎紡"
 
 vector vecTerm;
 iQuery.ParseQuery(vecTerm);  //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
 
 set setRelevantRst;
 iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
 
 gettimeofday(&end_tv,&tz);
 // search end
 //鎼滅儲瀹屾瘯view plaincopy to clipboardprint?
鐪婥HzSeg 涓殑榪欎釜鏂規硶 

鐪婥HzSeg 涓殑榪欎釜鏂規硶view plaincopy to clipboardprint?
//ChSeg/HzSeg.h 

//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**  
 * 紼嬪簭緲昏瘧璇存槑  
 * 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧  
 * @access  public  
 * @param   CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆  
 * @return  string 0  
 */  
// process a sentence before segmentation  
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙  
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const  
{  
    string s2="";  
    unsigned int i,len;  
 
    while (!s1.empty())   
    {  
        unsigned char ch=(unsigned char) s1[0];  
        if(ch<128)   
        { // deal with ASCII  
            i=1;  
            len = s1.size();  
            while (i<LEN len="s1.length();" i="0;" 涓枃鏍囩偣絳夐潪姹夊瓧瀛楃="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)  
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))  
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))  
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)   
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186  
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))   
                {   
                    ii=i+2; // 鍋囧畾娌℃湁鍗婁釜姹夊瓧  
                }  
 
                if (i==0) ii=i+2;  
 
                // 涓嶅鐞嗕腑鏂囩┖鏍?nbsp; 
                if (!(ch==161 && (unsigned char)s1[1]==161))   
                {   
                    if (i <= s1.size())  // yhf  
                        // 鍏朵粬鐨勯潪姹夊瓧鍙屽瓧鑺傚瓧絎﹀彲鑳借繛緇緭鍑?nbsp; 
                        s2 += s1.substr(0, i) + SEPARATOR;   
                    else break; // yhf  
                }  
 
                if (i <= s1.size())  // yhf  
                    s1s1=s1.substr(i);  
                else break;     //yhf  
 
                continue;  
            }  
        }  
      
 
    // 浠ヤ笅澶勭悊姹夊瓧涓?nbsp; 
 
        i = 2;  
        len = s1.length();  
 
        while(i<LEN></LEN>=176)   
//    while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)  
            i+=2;  
 
        s2+=SegmentHzStrMM(dict, s1.substr(0,i));  
 
        if (i <= len)    // yhf  
            s1s1=s1.substr(i);  
        else break; // yhf  
    }  
 
    return s2;  

/**
 * 紼嬪簭緲昏瘧璇存槑
 * 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧
 * @access  public
 * @param   CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆
 * @return  string 0
 */
// process a sentence before segmentation
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙
// Pre-process a sentence before dictionary segmentation.
//
// Consumes s1 from the front, one run at a time, and appends the processed
// output to s2, which is returned.  A leading byte below 128 is handled as
// ASCII; runs of bytes >= 176 are handed to SegmentHzStrMM(dict, ...).
//
// NOTE(review): this listing is damaged.  HTML extraction removed every span
// that started with '<' and ended with '>', so the two while-conditions
// below are truncated and the statements between the ASCII loop and the
// double-byte punctuation loop are missing (braces no longer balance).
// Restore from the upstream HzSeg.cpp before compiling.
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
 string s2="";
 unsigned int i,len;

 while (!s1.empty())
 {
  unsigned char ch=(unsigned char) s1[0];
  if(ch<128)
  { // deal with ASCII
   i=1;
   len = s1.size();
   // NOTE(review): truncated condition -- everything between "i" and
   // "=161)" was lost in extraction; as written this is an assignment.
   while (i=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
    {
     i=i+2; // assume there is never half of a Hanzi (always whole two-byte characters)
    }

    if (i==0) i=i+2;

    // do not process the Chinese (full-width) space -- NOTE(review): a stripped <br> fused the following statement into this comment:    if (!(ch==161 && (unsigned char)s1[1]==161))
    {
     if (i <= s1.size()) // yhf
      // other non-Hanzi double-byte characters may be output consecutively -- NOTE(review): a stripped <br> fused the following statement into this comment:      s2 += s1.substr(0, i) + SEPARATOR;
     else break; // yhf
    }

    if (i <= s1.size()) // yhf
     s1=s1.substr(i);
    else break;  //yhf

    continue;
   }
  }
   

    // from here on: handle a run of Hanzi

  i = 2;
  len = s1.length();

  // NOTE(review): truncated condition -- the text between "i" and "=176)"
  // was lost in extraction; as written this is an assignment.
  while(i=176)
//    while(i=128 && (unsigned char)s1[i]!=161)
   i+=2;

  // Segment the leading i bytes with the dictionary and append the result.
  s2+=SegmentHzStrMM(dict, s1.substr(0,i));

  if (i <= len) // yhf
   s1=s1.substr(i);
  else break; // yhf
 }

 return s2;
}view plaincopy to clipboardprint?
  

 view plaincopy to clipboardprint?
//Query.cpp 

//Query.cppview plaincopy to clipboardprint?
<PRE class=csharp name="code">/**  
 * 紼嬪簭緲昏瘧璇存槑  
 * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑  
 *  
 * @access  public  
 * @param   vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細鍚戦噺瀹瑰櫒  
 * @return  void  
 */  
void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)  
{  
    string::size_type idx;   
    while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {   
        vecTerm.push_back(m_sSegQuery.substr(0,idx));   
        m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);   
    }  
}  
</PRE> 
<PRE class=csharp name="code"> </PRE> 
<PRE class=csharp name="code"><PRE class=csharp name="code">/**  
 * 紼嬪簭緲昏瘧璇存槑  
 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp; 
 *  
 * @access  public  
 * @param   vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp; 
 * @return  string 0  
 */  
bool CQuery::GetRelevantRst  
(  
    vector<STRING></STRING> &vecTerm,   
    map &mapBuckets,   
    set<STRING></STRING> &setRelevantRst  
) const  
{  
    set<STRING></STRING> setSRst;  
 
    bool bFirst=true;  
    vector<STRING></STRING>::iterator itTerm = vecTerm.begin();  
 
    for ( ; itTerm != vecTerm.end(); ++itTerm )  
    {  
 
        setSRst.clear();  
        copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));  
 
        map mapRstDoc;  
        string docid;  
        int doccnt;  
 
        map::iterator itBuckets = mapBuckets.find(*itTerm);  
        if (itBuckets != mapBuckets.end())  
        {  
            string strBucket = (*itBuckets).second;  
            string::size_type idx;  
            idx = strBucket.find_first_not_of(" ");  
            strBucketstrBucket = strBucket.substr(idx);  
 
            while ( (idx = strBucket.find(" ")) != string::npos )   
            {  
                docid = strBucket.substr(0,idx);  
                doccnt = 0;  
 
                if (docid.empty()) continue;  
 
                map::iterator it = mapRstDoc.find(docid);  
                if ( it != mapRstDoc.end() )  
                {  
                    doccnt = (*it).second + 1;  
                    mapRstDoc.erase(it);  
                }  
                mapRstDoc.insert( pair(docid,doccnt) );  
 
                strBucketstrBucket = strBucket.substr(idx+1);  
            }  
 
            // remember the last one  
            docid = strBucket;  
            doccnt = 0;  
            map::iterator it = mapRstDoc.find(docid);  
            if ( it != mapRstDoc.end() )  
            {  
                doccnt = (*it).second + 1;  
                mapRstDoc.erase(it);  
            }  
            mapRstDoc.insert( pair(docid,doccnt) );  
        }  
 
        // sort by term frequencty  
        multimap > newRstDoc;  
        map::iterator it0 = mapRstDoc.begin();  
        for ( ; it0 != mapRstDoc.end(); ++it0 ){  
            newRstDoc.insert( pair((*it0).second,(*it0).first) );  
        }  
 
        multimap::iterator itNewRstDoc = newRstDoc.begin();  
        setRelevantRst.clear();  
        for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){  
            string docid = (*itNewRstDoc).second;  
 
            if (bFirst==true) {  
                setRelevantRst.insert(docid);  
                continue;  
            }  
 
            if ( setSRst.find(docid) != setSRst.end() ){      
                setRelevantRst.insert(docid);  
            }  
        }  
 
        //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";  
        bFirst = false;  
    }  
    return true;  
}</PRE> 
</PRE> 
鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?nbsp;

view plaincopy to clipboardprint?/**   * 紼嬪簭緲昏瘧璇存槑   * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑   *   * @access  public   * @param   vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細鍚戦噺瀹瑰櫒   * @return  void   */  void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)   {       string::size_type idx;        while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {            vecTerm.push_back(m_sSegQuery.substr(0,idx));            m_sSegQuery = m_sSegQuery.substr(idx+3);        }   }  /**
 * 紼嬪簭緲昏瘧璇存槑
 * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
 *
 * @access  public
 * @param   vector 鍙傛暟鐨勬眽瀛楄鏄庯細鍚戦噺瀹瑰櫒
 * @return  void
 */
void CQuery::ParseQuery(vector &vecTerm)
{
 string::size_type idx;
 while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {
  vecTerm.push_back(m_sSegQuery.substr(0,idx));
  m_sSegQuery = m_sSegQuery.substr(idx+3);
 }
}

view plaincopy to clipboardprint?   
view plaincopy to clipboardprint?<PRE class=csharp name="code">/**   * 紼嬪簭緲昏瘧璇存槑   * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp;  *   * @access  public   * @param   vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp;  * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const  {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = 
(*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return true;   }</PRE>  view plaincopy to clipboardprint?/**   * 紼嬪簭緲昏瘧璇存槑   * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp;  *   * @access  public   * @param   vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp;  * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const  {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = 
(*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return 
true;   }  /**
 * 紼嬪簭緲昏瘧璇存槑
 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?br> *
 * @access  public
 * @param   vector map set 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?br> * @return  string 0
 */
bool CQuery::GetRelevantRst
(
 vector &vecTerm,
 map &mapBuckets,
 set &setRelevantRst
) const
{
 set setSRst;

 bool bFirst=true;
 vector::iterator itTerm = vecTerm.begin();

 for ( ; itTerm != vecTerm.end(); ++itTerm )
 {

  setSRst.clear();
  copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

  map mapRstDoc;
  string docid;
  int doccnt;

  map::iterator itBuckets = mapBuckets.find(*itTerm);
  if (itBuckets != mapBuckets.end())
  {
   string strBucket = (*itBuckets).second;
   string::size_type idx;
   idx = strBucket.find_first_not_of(" ");
   strBucket = strBucket.substr(idx);

   while ( (idx = strBucket.find(" ")) != string::npos )
   {
    docid = strBucket.substr(0,idx);
    doccnt = 0;

    if (docid.empty()) continue;

    map::iterator it = mapRstDoc.find(docid);
    if ( it != mapRstDoc.end() )
    {
     doccnt = (*it).second + 1;
     mapRstDoc.erase(it);
    }
    mapRstDoc.insert( pair(docid,doccnt) );

    strBucket = strBucket.substr(idx+1);
   }

   // remember the last one
   docid = strBucket;
   doccnt = 0;
   map::iterator it = mapRstDoc.find(docid);
   if ( it != mapRstDoc.end() )
   {
    doccnt = (*it).second + 1;
    mapRstDoc.erase(it);
   }
   mapRstDoc.insert( pair(docid,doccnt) );
  }

  // sort by term frequencty
  multimap > newRstDoc;
  map::iterator it0 = mapRstDoc.begin();
  for ( ; it0 != mapRstDoc.end(); ++it0 ){
   newRstDoc.insert( pair((*it0).second,(*it0).first) );
  }

  multimap::iterator itNewRstDoc = newRstDoc.begin();
  setRelevantRst.clear();
  for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
   string docid = (*itNewRstDoc).second;

   if (bFirst==true) {
    setRelevantRst.insert(docid);
    continue;
   }

   if ( setSRst.find(docid) != setSRst.end() ){ 
    setRelevantRst.insert(docid);
   }
  }

  //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
  bFirst = false;
 }
 return true;
}

鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?br>//TSESearch.cpp

view plaincopy to clipboardprint?
// Start displaying the results.
    CDisplayRst iDisplayRst;   
    iDisplayRst.ShowTop();   
 
    // Elapsed search time in milliseconds: whole seconds scaled by 1000 plus
    // the microsecond difference divided by 1000.
    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000   
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;   
 
    // Passes the raw query, the elapsed time, the number of matching
    // documents and the start offset taken from the query object.
    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,   
            setRelevantRst.size(), iQuery.m_iStart);  
 
    // presumably renders the hit list itself -- TODO confirm in CDisplayRst
    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

 



]]>
久久露脸国产精品| 麻豆亚洲AV永久无码精品久久| 久久国产香蕉一区精品| 性做久久久久久久久| 久久九九兔免费精品6| 久久亚洲国产中v天仙www| 四虎久久影院| 天天综合久久久网| 99久久精品国产一区二区| 99久久国产综合精品成人影院| 亚洲国产成人久久综合一区77| 九九久久自然熟的香蕉图片| 久久精品亚洲精品国产欧美| 久久久久国产精品熟女影院| 日韩久久久久中文字幕人妻 | 青青国产成人久久91网| 7777精品伊人久久久大香线蕉| 热久久这里只有精品| 久久精品人人槡人妻人人玩AV| 香蕉久久久久久狠狠色| 久久精品无码一区二区app| 韩国无遮挡三级久久| 久久久噜噜噜www成人网| 亚洲欧美一级久久精品| 久久国产免费| 国产精品免费久久久久久久久 | 久久www免费人成看国产片| 韩国无遮挡三级久久| 日本人妻丰满熟妇久久久久久| 亚洲人成无码www久久久| 国产精品亚洲美女久久久| aaa级精品久久久国产片| 亚洲午夜久久久久久久久电影网| 久久久久久久91精品免费观看| 国产福利电影一区二区三区,免费久久久久久久精 | 久久天天躁狠狠躁夜夜2020| 青青草原1769久久免费播放| 99久久精品免费看国产免费| 国产精品免费久久久久影院| 久久久青草青青国产亚洲免观| 久久亚洲电影|