锘??xml version="1.0" encoding="utf-8" standalone="yes"?> 鐜板湪閬囧埌涓涓渶姹傛槸鍙戠幇鎶ヨ鏃跺疄鏃跺彂閫佹秷鎭粰鐩稿叧浜哄憳錛岀敱浜庡叕鍙哥煭淇$綉鍏沖彧涔頒簡涓婃搗鐢典俊鐢ㄦ埛娌℃湁涓婃搗鐢典俊鐨勫彿鐮侊紝姹椾竴涓紝鍙ソ閫氳繃鍙戦偖浠舵潵瀹炴柦銆?/p>
鏀寔鍙戦丟B18030緙栫爜鐨勬枃鏈唴瀹癸紝浠繪剰緙栫爜闄勪歡錛屽彲浠ュ仛鍑洪傚綋淇敼鏀寔緹ゅ彂銆?/p>
windows xp涓嬶細 linux ubuntu錛宻use涓嬶細 鏀跺埌鐨勭粨鏋滐細 鐙珛鐨勭洃鎺ц剼鏈槸榪斿洖涓涓垪琛ㄥ祵濂楀厓緇勭殑鏁版嵁緇撴瀯錛屾渶鍚庡啀姹囨繪垚涓涓畬鏁寸殑XML鏁版嵁宀涳紝涓轟簡璋冭瘯鏂逛究鑴氭湰鐨勬瘡涓涓腑闂寸粨鏋滈兘瀵煎嚭鍒頒竴涓復鏃舵枃鏈腑銆?/p>
榪愯浠ヤ笅鑴氭湰瑕佺‘瀹氫綘鐨刲inux瑁呬簡ethtool宸ュ叿錛屽湪ubuntu2.6.27-7-server錛寀buntu22.6.27.19-5-default錛宻use 2.6.27.19-5-default 嫻嬭瘯閫氳繃銆?/p>
浠g爜錛?/p>
浣跨敤渚嬪瓙錛?/p>
姣忎竴涓垪琛ㄥ厓绱犲厓緇勯噷闈㈢浜屼釜鍏冪礌絎竴涓瓧孌典負緗戦?Bytes/S錛屼緥濡俥th1緗戝崱鐨勭綉閫熷氨鏄?.3KB/s錛宔th0緗戦熸槸2.9KB/s錛屼粖澶╂槸鍛ㄥ叚榪欎釜嫻侀噺寰堟甯?/p>
int main(int argc, char* argv[]) //./CrtInvertedIdx moon.fidx.sort > sun.iidx string strLine,strDocNum,tmp1=""; if (tmp.size()<2 || tmp.size() > 8) continue; if (tmp1.empty()) tmp1=tmp; if (tmp == tmp1) cout << tmp1 << "\t" << strDocNum << endl; cnt++; return 0; 璇︾粏鐨勬枃浠跺姛鑳藉拰浠嬬粛閮藉湪榪欓噷鏈変簡浠嬬粛鑷《鍚戜笅瀛︽悳绱㈠紩鎿庘斺斿寳澶уぉ緗戞悳绱㈠紩鎿嶵SE鍒嗘瀽鍙婂畬鍏ㄦ敞閲奫5]鍊掓帓绱㈠紩鐨勫緩绔嬪強鏂囦歡浠嬬粛 CrtForwardIdx.cpp鏂囦歡 int main(int argc, char* argv[]) //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx string strLine,strDocNum; cnt++; while ( (idx = strLine.find(SEPARATOR)) != string::npos ) //鎸囧畾鏌ユ壘鍒嗙晫絎? //if (cnt==100) break; return 0; author:http://hi.baidu.com/jrckkyy author:http://blog.csdn.net/jrckkyy 榪欓噷杈撳叆 Tianwang.raw.*****錛孌oc.idx錛孶rl.idx.sort_uniq絳変笁涓枃浠訛紝杈撳嚭涓涓猅ianwang.raw.***.seg 鍒嗚瘝瀹屾瘯鐨勬枃浠?/p>
int main(int argc, char* argv[]) //ifstream ifs("Tianwang.raw.2559638448"); ifstream ifsUrl("Url.idx.sort_uniq"); //鎺掑簭騫舵秷閲嶅悗鐨剈rl瀛楀吀 while (getline(ifsUrl,strLine)) //鍋忕url瀛楀吀瀛樺叆涓涓悜閲忓唴瀛樹腑 memset(chksum, 0, 33); while (getline(ifsDoc,strLine)) //鍋忕瀛楀吀鏂囦歡灝嗗叾鏀懼叆涓涓悜閲忓唴瀛樹腑 memset(chksum, 0, 33); strFileName += ".seg"; // find document according to docId char *s; // skip Head //iDocument.m_sBody = s; delete[] pContent; CStrFun::ReplaceStr(strLine, " ", " "); return(0);
]]>
{
ifstream ifsImgInfo(argv[1]);
if (!ifsImgInfo)
{
cerr << "Cannot open " << argv[1] << " for input\n";
return -1;
}
int cnt = 0;
while (getline(ifsImgInfo, strLine))
{
string::size_type idx;
string tmp;
idx = strLine.find("\t");
tmp = strLine.substr(0,idx);
{
strDocNum = strDocNum + " " + strLine.substr(idx+1);
}
else
{
if ( strDocNum.empty() )
strDocNum = strDocNum + " " + strLine.substr(idx+1);
tmp1 = tmp;
strDocNum.clear();
strDocNum = strDocNum + " " + strLine.substr(idx+1);
}
//if (cnt==100) break;
}
cout << tmp1 << "\t" << strDocNum << endl; //鍊掓帓绱㈠紩涓瘡涓瓧鍏稿崟璇嶅悗鐨勬枃妗g紪鍙蜂互table閿負闂撮殧
}
]]>
{
ifstream ifsImgInfo(argv[1]);
if (!ifsImgInfo)
{
cerr << "Cannot open " << argv[1] << " for input\n";
return -1;
}
int cnt = 0;
while (getline(ifsImgInfo, strLine))
{
string::size_type idx;
if (cnt%2 == 1) //濂囨暟琛屼負鏂囨。緙栧彿
{
strDocNum = strLine.substr(0,strLine.size());
continue;
}
if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
{
continue;
}
{
string tmp1 = strLine.substr(0,idx);
cout << tmp1 << "\t" << strDocNum << endl;
strLine = strLine.substr(idx + SEPARATOR.size());
}
}
}
]]>
{
string strLine, strFileName=argv[1];
CUrl iUrl;
vector<CUrl> vecCUrl;
CDocument iDocument;
vector<CDocument> vecCDocument;
unsigned int docId = 0;
ifstream ifs(strFileName.c_str()); //DocSegment Tianwang.raw.****
if (!ifs)
{
cerr << "Cannot open tianwang.img.info for input\n";
return -1;
}
if (!ifsUrl)
{
cerr << "Cannot open Url.idx.sort_uniq for input\n";
return -1;
}
ifstream ifsDoc("Doc.idx"); //瀛楀吀鏂囦歡
if (!ifsDoc)
{
cerr << "Cannot open Doc.idx for input\n";
return -1;
}
{
char chksum[33];
int docid;
sscanf( strLine.c_str(), "%s%d", chksum, &docid );
iUrl.m_sChecksum = chksum;
iUrl.m_nDocId = docid;
vecCUrl.push_back(iUrl);
}
{
int docid,pos,length;
char chksum[33];
sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
iDocument.m_nDocId = docid;
iDocument.m_nPos = pos;
iDocument.m_nLength = length;
iDocument.m_sChecksum = chksum;
vecCDocument.push_back(iDocument);
}
ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary); //璁劇疆瀹屾垚鍒嗚瘝鍚庣殑鏁版嵁杈撳嚭鏂囦歡
for ( docId=0; docId<MAX_DOC_ID; docId++ )
{
int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
char *pContent = new char[length+1];
memset(pContent, 0, length+1);
ifs.seekg(vecCDocument[docId].m_nPos);
ifs.read(pContent, length);
s = pContent;
int bytesRead = 0,newlines = 0;
while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
{
if (*s == '\n')
newlines++;
else
newlines = 0;
s++;
bytesRead++;
}
if (bytesRead == HEADER_BUF_SIZE-1) continue;
// skip header
bytesRead = 0,newlines = 0;
while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
{
if (*s == '\n')
newlines++;
else
newlines = 0;
s++;
bytesRead++;
}
if (bytesRead == HEADER_BUF_SIZE-1) continue;
iDocument.RemoveTags(s); //鍘婚櫎<>
iDocument.m_sBodyNoTags = s;
string strLine = iDocument.m_sBodyNoTags;
CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
// segment the document 鍏蜂綋鍒嗚瘝澶勭悊
CHzSeg iHzSeg;
strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
fout << docId << endl << strLine;
fout << endl;
}
}
榪欓噷鍙槸嫻厜鎺犲獎寮忕殑榪囦竴閬嶅ぇ姒傜殑浠g爜錛屽悗闈㈡垜浼氭湁涓撻璇︾粏璁茶В parse html 鍜?segment docment 絳夋妧鏈?/p>
]]>
author:http://blog.csdn.net/jrckkyy
上一篇主要介绍了倒排索引建立相关的文件及中间文件。<br>TSE建立索引在运行程序上的大致步骤可以简化分为以下几步：
1、运行命令 ./DocIndex
浼氱敤鍒頒竴涓枃浠?tianwang.raw.520 //鐖彇鍥炴潵鐨勫師濮嬫枃浠訛紝鍖呭惈澶氫釜緗戦〉鐨勬墍鏈変俊鎭紝鎵浠ュ緢澶э紝榪欎篃鏄竴涓湁寰呰В鍐崇殑闂錛屽埌搴曞瓨鎴愬ぇ鏂囦歡錛堝鏋滆繃澶т細瓚呰繃2G鎴?G鐨勯檺鍒訛紝鑰屼笖鏂囦歡榪囧ぇ绱㈠紩鏁堢巼榪囦綆錛夎繕鏄皬鏂囦歡錛堟枃浠舵暟榪囧鐢ㄤ簬鎵撳紑鍏抽棴鏂囦歡鍙ユ焺鐨勬秷鑰楄繃澶э級榪樻湁寰呮濊冿紝榪樺氨鏄瓨鍌ㄦ柟妗堢殑瑙e喅鏈緇堣偗瀹氭槸瑕佸瓨涓哄垎甯冨紡鐨勶紝鏈緇堟繪枃浠墮噺鑲畾鏄細涓奣B鐨勶紝TSE鍙敮鎸佸皬鍨嬬殑鎼滅儲寮曟搸闇姹傘?nbsp;
浼氫駭鐢熶竴涓嬩笁涓枃浠?Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx
2、运行命令 sort Url.idx|uniq > Url.idx.sort_uniq //Data文件夹中的Url.idx.sort_uniq
浼氱敤鍒頒竴涓枃浠?Url.idx鏂囦歡 //md5 hash 涔嬪悗鐨剈rl瀹屾暣鍦板潃鍜宒ocument id鍊煎
浼氫駭鐢熶竴涓枃浠?Url.idx.sort_uniq //URL娑堥噸錛宮d5 hash鎺掑簭錛屾彁楂樻绱㈡晥鐜?/p>
3、运行命令 ./DocSegment Tianwang.raw.2559638448
浼氱敤鍒頒竴涓枃浠?Tianwang.raw.2559638448 //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶達紝鍒嗚瘝涓哄悗闈㈠緩绔嬪埌鎺掔儲寮曞仛鍑嗗
浼氫駭鐢熶竴涓枃浠?Tianwang.raw.2559638448.seg //鍒嗚瘝鏂囦歡錛岀敱涓琛宒ocument id鍙峰拰涓琛屾枃妗e垎璇嶇粍錛堝彧瀵規瘡涓枃妗?lt;html></html>涓?lt;head></head><body></body>絳夋枃瀛楁爣璁頒腑鐨勬枃鏈繘琛屽垎緇勶級鏋勬垚
4、运行命令 ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立独立的正向索引</p>
5銆佽繍琛屽懡浠?br>#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort
6、运行命令 ./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引
鎴戜滑鍏堜粠寤虹珛绱㈠紩鐨勭涓涓▼搴廌ocIndex.cpp寮濮嬪垎鏋愩?娉ㄩ噴綰﹀畾錛歍ianwang.raw.2559638448鏄姄鍥炴潵鍚堝茍鎴愮殑澶ф枃浠訛紝鍚庨潰灝卞彨澶ф枃浠訛紝閲岄潰鍖呭惈浜嗗緢澶氱瘒html鏂囨。錛岄噷闈㈢殑鏂囨。鏈夎寰嬬殑鍒嗛殧灝卞彨鍋氫竴綃囦竴綃囩殑鏂囨。)
//DocIndex.h start-------------------------------------------------------------
#ifndef _COMM_H_040708_
#define _COMM_H_040708_
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;

// Byte limit applied when scanning over a record/HTTP header.
const unsigned int HEADER_BUF_SIZE(1024);
// Number of hits the search front end returns per result page.
const unsigned int RstPerPage(20);

//iceway
// Earlier value, kept for reference (needed by DocSegment.cpp):
//const unsigned MAX_DOC_IDX_ID = 21312;
const unsigned int MAX_DOC_IDX_ID(22104);

//const string IMG_INFO_NAME("./Data/s1.1");

// Inverted index file.
// Sample rows (a term followed by document ids):
//   <term> 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//   <term> 1085 1222
// The lexicon file holds 90,000+ entries, including special symbols,
// punctuation and Chinese characters.
const string INF_INFO_NAME = "./Data/sun.iidx";

// NOTE(review): the original comment here also read "inverted index file",
// but the path names Doc.idx — presumably the document index; confirm.
const string DOC_IDX_NAME = "./Data/Doc.idx";
const string RAWPAGE_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0";

//iceway
// Needed by Docindex.cpp.
const string DOC_FILE_NAME("Tianwang.swu.iceway.1.0");
// Needed by Snapshot.cpp.
const string Data_DOC_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");
//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");
//const string THUMBNAIL_DIR("/ImgSE/timg/");
#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------
#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"
//iceway(mnsc)
#include "Comm.h"
#include
using namespace std;
int main(int argc, char* argv[])
{
//ifstream ifs("Tianwang.raw.2559638448");
//ifstream ifs("Tianwang.raw.3023555472");
//iceway(mnsc)
ifstream ifs(DOC_FILE_NAME.c_str()); //鎵撳紑Tianwang.raw.3023555472鏂囦歡錛屾渶鍘熷鐨勬枃浠?br> if (!ifs)
{
cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
return -1;
}
ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮Url.idx鏂囦歡
if( !ofsUrl )
{
cout << "error open file " << endl;
}
ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮Doc.idx鏂囦歡
if( !ofsDoc )
{
cout << "error open file " << endl;
}
ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //寤虹珛騫舵墦寮DocId2Url.idx鏂囦歡
if( !ofsDocId2Url )
{
cout << "error open file " << endl;
}
int cnt=0; //鏂囨。緙栧彿浠?寮濮嬭綆?br> string strLine,strPage;
CUrl iUrl;
CDocument iDocument;
CMD5 iMD5;
int nOffset = ifs.tellg();
while (getline(ifs, strLine))
{
if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
{
nOffset = ifs.tellg();
continue;
}
if (!strncmp(strLine.c_str(), "version: 1.0", 12)) //鍒ゆ柇絎竴琛屾槸鍚︽槸version: 1.0濡傛灉鏄氨瑙f瀽涓嬪幓
{
if(!getline(ifs, strLine)) break;
if (!strncmp(strLine.c_str(), "url: ", 4)) //鍒ゆ柇絎簩琛屾槸鍚︽槸url: 濡傛灉鏄垯瑙f瀽涓嬪幓
{
iUrl.m_sUrl = strLine.substr(5); //鎴彇url: 浜斾釜瀛楃涔嬪悗鐨剈rl鍐呭
iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); //瀵箄rl鐢╩d5 hash澶勭悊
iUrl.m_sChecksum = iMD5.ToString(); //灝嗗瓧絎︽暟緇勭粍鍚堟垚瀛楃涓茶繖涓嚱鏁板湪Md5.h涓疄鐜?/p>
} else
{
continue;
}
while (getline(ifs, strLine))
{
if (!strncmp(strLine.c_str(), "length: ", 8)) //涓鐩磋涓嬪幓鐩村埌鍒ゆ柇婢規竟(鐩稿絎簲琛?鎯烘瑺琚瘋帒ength: 鏄垯鎺ヤ笅涓嬪幓
{
sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); //灝嗚鍧楁墍浠h〃緗戦〉鐨勫疄闄呯綉欏靛唴瀹歸暱搴︽斁鍏Document鏁版嵁緇撴瀯涓?br> break;
}
}
getline(ifs, strLine); //璺寵繃鐩稿絎叚琛屾晠鎰忕暀鐨勪竴涓┖琛?/p>
iDocument.m_nDocId = cnt; //灝嗘枃妗g紪鍙瘋祴鍊煎埌iDocument鏁版嵁緇撴瀯涓?br> iDocument.m_nPos = nOffset; //鏂囨。緇撳熬鍦ㄥぇ鏂囦歡涓殑緇撴潫琛屽彿
char *pContent = new char[iDocument.m_nLength+1]; //鏂板緩璇ユ枃妗i暱搴︾殑瀛楃涓叉寚閽?/p>
memset(pContent, 0, iDocument.m_nLength+1); //姣忎竴浣嶅垵濮嬪寲涓?
ifs.read(pContent, iDocument.m_nLength); //鏍規嵁鑾峰緱鐨勬枃妗i暱搴﹁鍙栨竟(鍏朵腑鍖呭惈鍗忚澶?璇誨彇鏂囨。鍐呭
iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
iDocument.m_sChecksum = iMD5.ToString(); //灝嗗瓧絎︽暟緇勭粍鍚堟垚瀛楃涓茶繖涓嚱鏁板湪Md5.h涓疄鐜?br>
delete[] pContent;
ofsUrl << iUrl.m_sChecksum ; //灝唌d5hash鍚庣殑url鍐欏叆Url.idx鏂囦歡
ofsUrl << "\t" << iDocument.m_nDocId << endl; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢鏂囦歡緙栧彿鍐欏叆Url.idx鏂囦歡
ofsDoc << iDocument.m_nDocId ; //灝嗘枃浠剁紪鍙峰啓鍏oc.idx鏂囦歡
ofsDoc << "\t" << iDocument.m_nPos ; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢璇ユ枃妗g粨鏉熻鍙鋒竟(鍚屾牱涔熸槸涓嬩竴鏂囨。寮濮嬭鍙?鍐欏叆Doc.idx鏂囦歡
//ofsDoc << "\t" << iDocument.m_nLength ;
ofsDoc << "\t" << iDocument.m_sChecksum << endl; //鍦ㄤ竴琛屼腑涓涓猼ab璺濈鍒嗛殧錛屽皢md5hash鍚庣殑url鍐欏叆Doc.idx鏂囦歡
ofsDocId2Url << iDocument.m_nDocId ; //灝嗘枃浠剁紪鍙峰啓鍏ocId2Url.idx鏂囦歡
ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; //灝嗚鏂囨。鐨勫畬鏁磚rl鍐欏叆DocId2Url.idx鏂囦歡
cnt++; //鏂囨。緙栧彿鍔犱竴璇存槑璇ヤ互鏂囨。鍒嗘瀽瀹屾瘯錛岀敓鎴愪笅涓鏂囨。鐨勭紪鍙?br> }
nOffset = ifs.tellg();
}
//鏈鍚庝竴琛屽彧鏈夋枃妗e彿鍜屼笂涓綃囨枃妗g粨鏉熷彿
ofsDoc << cnt ;
ofsDoc << "\t" << nOffset << endl;
return(0);
}
//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy
author:http://blog.csdn.net/jrckkyy
TSE鐢ㄧ殑鏄皢鎶撳彇鍥炴潵鐨勭綉欏墊枃妗e叏閮ㄨ鍏ヤ竴涓ぇ鏂囨。錛岃鍚庡榪欎竴涓ぇ鏂囨。鍐呯殑鏁版嵁鏁翠綋緇熶竴鐨勫緩绱㈠紩錛屽叾涓寘鍚簡鍑犱釜姝ラ銆?/p>
view plaincopy to clipboardprint?
1. The document index (Doc.idx) keeps information about each document.
It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.
The information stored in each entry includes a pointer into the repository,
a document length, a document checksum.
//Doc.idx 鏂囨。緙栧彿 鏂囨。闀垮害 checksum hash鐮?nbsp;
0 0 bc9ce846d7987c4534f53d423380ba70
1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
2 141624 d019433008538f65329ae8e39b86026c
3 142350 5705b8f58110f9ad61b1321c52605795
//Doc.idx end
The url index (url.idx) is used to convert URLs into docIDs.
//url.idx
5c36868a9c5117eadbda747cbdb0725f 0
3272e136dd90263ee306a835c6c70d77 1
6b8601bb3bb9ab80f868d549b5c5a5f3 2
3f9eba99fa788954b5ff7f35a5db6e1f 3
//url.idx end
It is a list of URL checksums with their corresponding docIDs and is sorted by
checksum. In order to find the docID of a particular URL, the URL's checksum
is computed and a binary search is performed on the checksums file to find its
docID.
./DocIndex
got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?nbsp;
//DocId2Url.idx
0 http://*.*.edu.cn/index.aspx
1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118
2 http://*.*.edu.cn/0102.html
3 http://*.*.edu.cn/0103.html
//DocId2Url.idx end
2. sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq
//Url.idx.sort_uniq
//瀵筯ash鍊艱繘琛屾帓搴?nbsp;
000bfdfd8b2dedd926b58ba00d40986b 1111
000c7e34b653b5135a2361c6818e48dc 1831
0019d12f438eec910a06a606f570fde8 366
0033f7c005ec776f67f496cd8bc4ae0d 2103
3. Segment document to terms, (with finding document according to the url)
./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?nbsp;
got Tianwang.raw.2559638448.seg
//Tianwang.raw.2559638448 鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑
version: 1.0
url: http://***.105.138.175/Default2.asp?lang=gb
origin: http://***.105.138.175/
date: Fri, 23 May 2008 20:01:36 GMT
ip: 162.105.138.175
length: 38413
HTTP/1.1 200 OK
Server: Microsoft-IIS/5.0
Date: Fri, 23 May 2008 11:17:49 GMT
Connection: keep-alive
Connection: Keep-Alive
Content-Length: 38088
Content-Type: text/html; Charset=gb2312
Expires: Fri, 23 May 2008 11:17:49 GMT
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/
Cache-control: private
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"
<html>
<head>
<title>Apabi鏁板瓧璧勬簮騫沖彴</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">
<META NAME="DESCRIPTION" CONTENT="鏁板瓧鍥句功棣?鏂規鏁板瓧鍥句功棣?鐢靛瓙鍥句功 鐢靛瓙涔?ebook e涔?Apabi 鏁板瓧璧勬簮騫沖彴">
<link rel="stylesheet" type="text/css" href="css\common.css">
<style type="text/css">
<!--
.style4 {color: #666666}
-->
</style>
<script LANGUAGE="vbscript">
...
</script>
<Script Language="javascript">
...
</Script>
</head>
<body leftmargin="0" topmargin="0">
</body>
</html>
//Tianwang.raw.2559638448 end
//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?娉ㄦ剰涓棿娌℃湁鍥炶濺浣滀負鍒嗛殧)
1
...
...
...
2
...
...
...
//Tianwang.raw.2559638448.seg end
//涓嬫槸 Tiny search 闈炲繀欏誨洜绱?nbsp;
4. Create forward index (docic-->termid) //寤虹珛姝e悜绱㈠紩
./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx
//Tianwang.raw.2559638448.seg 灝嗘瘡涓〉闈㈠垎鎴愪竴琛屽涓?lt;BR>//鍒嗚瘝 DocID<BR>1<BR>涓夋槦/ s/ 鎵嬫満/ 璁哄潧/ ,/ 鎵嬫満/ 閾冨0/ 涓嬭澆/ ,/ 鎵嬫満/ 鍥劇墖/ 涓嬭澆/ ,/ 鎵嬫満/<BR>2<BR>...<BR>...<BR>...
1. The document index (Doc.idx) keeps information about each document.
It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.
The information stored in each entry includes a pointer into the repository,
a document length, a document checksum.
//Doc.idx 鏂囨。緙栧彿 鏂囨。闀垮害 checksum hash鐮?/p>
0 0 bc9ce846d7987c4534f53d423380ba70
1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
2 141624 d019433008538f65329ae8e39b86026c
3 142350 5705b8f58110f9ad61b1321c52605795
//Doc.idx end
The url index (url.idx) is used to convert URLs into docIDs.
//url.idx
5c36868a9c5117eadbda747cbdb0725f 0
3272e136dd90263ee306a835c6c70d77 1
6b8601bb3bb9ab80f868d549b5c5a5f3 2
3f9eba99fa788954b5ff7f35a5db6e1f 3
//url.idx end
It is a list of URL checksums with their corresponding docIDs and is sorted by
checksum. In order to find the docID of a particular URL, the URL's checksum
is computed and a binary search is performed on the checksums file to find its
docID.
./DocIndex
got Doc.idx, Url.idx, DocId2Url.idx //Data鏂囦歡澶逛腑鐨凞oc.idx DocId2Url.idx鍜孌oc.idx涓?/p>
//DocId2Url.idx
0 http://*.*.edu.cn/index.aspx
1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118
//DocId2Url.idx end
2. sort Url.idx|uniq > Url.idx.sort_uniq //Data鏂囦歡澶逛腑鐨刄rl.idx.sort_uniq
//Url.idx.sort_uniq
//瀵筯ash鍊艱繘琛屾帓搴?/p>
000bfdfd8b2dedd926b58ba00d40986b 1111
000c7e34b653b5135a2361c6818e48dc 1831
0019d12f438eec910a06a606f570fde8 366
0033f7c005ec776f67f496cd8bc4ae0d 2103
3. Segment document to terms, (with finding document according to the url)
./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448涓虹埇鍥炴潵鐨勬枃浠?錛屾瘡涓〉闈㈠寘鍚玥ttp澶?/p>
got Tianwang.raw.2559638448.seg
//Tianwang.raw.2559638448 鐖彇鐨勫師濮嬬綉欏墊枃浠跺湪鏂囨。鍐呴儴姣忎竴涓枃妗d箣闂村簲璇ユ槸閫氳繃version錛?lt;/html>鍜屽洖杞﹀仛鏍囧織浣嶅垎鍓茬殑
version: 1.0
url: http://***.105.138.175/Default2.asp?lang=gb
origin: http://***.105.138.175/
date: Fri, 23 May 2008 20:01:36 GMT
ip: 162.105.138.175
length: 38413
HTTP/1.1 200 OK
Server: Microsoft-IIS/5.0
Date: Fri, 23 May 2008 11:17:49 GMT
Connection: keep-alive
Connection: Keep-Alive
Content-Length: 38088
Content-Type: text/html; Charset=gb2312
Expires: Fri, 23 May 2008 11:17:49 GMT
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/
Cache-control: private
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1銆佸瓧鍏哥殑閫夊彇錛堜簨瀹炰笂鏍規嵁涓嶅悓鏃朵唬涓嶅悓鍦版柟浜轟滑鐨勮璦涔犳儻鏄笉涓鏍風殑鎵浠ヨ瀛楀吀鐨勬渶灝忓厓鐨勫彇鍊兼槸涓嶅悓鐨勶級
2銆佸掓帓绱㈠紩鐨勫緩绔嬶紙榪欓噷灝辮娑夊強鍒扮埇铏殑鎶撳彇鍜岀儲寮曠殑寤虹珛鍚庨潰灝嗛噸鐐逛粙緇嶈繖2鐐癸紝鎼滅儲寮曟搸鐨勬晥鐜囧拰鏈嶅姟璐ㄩ噺瀹炴晥鎬х摱棰堝湪榪欓噷錛?/p>
3銆佺浉鍏蟲у垎鏋愶紙瀵規姄鍥炴潵鐨勬枃妗e垎璇嶅緩绱㈠紩鍜岀敤鎴峰叧閿瓧鍒嗚瘝綆楁硶涓婅瀵瑰簲錛?/p>
鍚庨潰鏂囩珷浼氶噸鐐逛粙緇嶇埇铏殑鎶撳彇鍜岀儲寮曠殑寤虹珛銆?/p>
//TSESearch.cpp涓細
view plaincopy to clipboardprint?
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴? 鐖? 浣犱滑/ 鐨? 鏍煎紡"
vector<STRING></STRING> vecTerm;
iQuery.ParseQuery(vecTerm); //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
set<STRING></STRING> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//鎼滅儲瀹屾瘯
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //灝唃et鍒扮殑鏌ヨ鍙橀噺鍒嗚瘝鍒嗘垚 "鎴? 鐖? 浣犱滑/ 鐨? 鏍煎紡"
vector vecTerm;
iQuery.ParseQuery(vecTerm); //灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//鎼滅儲瀹屾瘯view plaincopy to clipboardprint?
鐪婥HzSeg 涓殑榪欎釜鏂規硶
鐪婥HzSeg 涓殑榪欎釜鏂規硶view plaincopy to clipboardprint?
//ChSeg/HzSeg.h
//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**
* 紼嬪簭緲昏瘧璇存槑
* 榪涗竴姝ュ噣鍖栨暟鎹紝杞崲姹夊瓧
* @access public
* @param CDict, string 鍙傛暟鐨勬眽瀛楄鏄?瀛楀吀錛屾煡璇㈠瓧絎︿覆
* @return string 0
*/
// process a sentence before segmentation
//鍦ㄥ垎璇嶅墠澶勭悊鍙ュ瓙
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;
while (!s1.empty())
{
unsigned char ch=(unsigned char) s1[0];
if(ch<128)
{ // deal with ASCII
i=1;
len = s1.size();
while (i<LEN len="s1.length();" i="0;" 涓枃鏍囩偣絳夐潪姹夊瓧瀛楃="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
{
ii=i+2; // 鍋囧畾娌℃湁鍗婁釜姹夊瓧
}
if (i==0) ii=i+2;
// 涓嶅鐞嗕腑鏂囩┖鏍?nbsp;
if (!(ch==161 && (unsigned char)s1[1]==161))
{
if (i <= s1.size()) // yhf
// 鍏朵粬鐨勯潪姹夊瓧鍙屽瓧鑺傚瓧絎﹀彲鑳借繛緇緭鍑?nbsp;
s2 += s1.substr(0, i) + SEPARATOR;
else break; // yhf
}
if (i <= s1.size()) // yhf
s1s1=s1.substr(i);
else break; //yhf
continue;
}
}
// 浠ヤ笅澶勭悊姹夊瓧涓?nbsp;
i = 2;
len = s1.length();
while(i<LEN></LEN>=176)
// while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)
i+=2;
s2+=SegmentHzStrMM(dict, s1.substr(0,i));
if (i <= len) // yhf
s1s1=s1.substr(i);
else break; // yhf
}
return s2;
}
/**
 * Pre-processes a sentence before segmentation.
 *
 * Consumes s1 from the front and appends output to s2, which is returned.
 * The first byte of the remaining text selects the handling: a branch for
 * lead bytes below 128 (ASCII), checks against double-byte lead/trail byte
 * values (161, 163, ...), and finally a part that hands a prefix of s1 to
 * SegmentHzStrMM(dict, ...).
 *
 * NOTE(review): this listing is damaged. Two loop conditions are truncated
 * and two statements have been fused into comments (flagged inline below).
 * As shown here the function does not compile; restore the missing text
 * from the original HzSeg.cpp before relying on it.
 *
 * @access public
 * @param dict  dictionary, passed through to SegmentHzStrMM
 * @param s1    query string to process (taken by value and consumed)
 * @return string  the accumulated output s2
 */
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;
while (!s1.empty())
{
unsigned char ch=(unsigned char) s1[0];
if(ch<128)
{ // deal with ASCII
i=1;
len = s1.size();
// NOTE(review): the condition below is truncated — the text that originally
// sat between `i` and `=161` is missing, including (presumably) the rest of
// the ASCII branch and the start of the following branch. TODO restore.
while (i=161)
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
{
i=i+2; // assume there is no half (truncated) Chinese character
}
if (i==0) i=i+2;
// Do not process the Chinese (full-width) space.
// NOTE(review): the statement `if (!(ch==161 && (unsigned char)s1[1]==161))`
// was fused into this comment line by a lost line break; the block below is
// presumably its body. TODO restore.
{
if (i <= s1.size()) // yhf
// Other non-Hanzi double-byte characters may be output consecutively.
// NOTE(review): the statement `s2 += s1.substr(0, i) + SEPARATOR;` was fused
// into this comment line by a lost line break, leaving the `if` above with
// no body. TODO restore.
else break; // yhf
}
if (i <= s1.size()) // yhf
s1=s1.substr(i);
else break; //yhf
continue;
}
}
// The code below handles runs of Chinese characters (Hanzi).
i = 2;
len = s1.length();
// NOTE(review): truncated condition; judging from the commented-out variant
// on the next line it presumably compared i against len and s1[i] against 176
// — TODO confirm against the original source.
while(i=176)
// while(i=128 && (unsigned char)s1[i]!=161)
i+=2;
s2+=SegmentHzStrMM(dict, s1.substr(0,i));
if (i <= len) // yhf
s1=s1.substr(i);
else break; // yhf
}
return s2;
}view plaincopy to clipboardprint?
view plaincopy to clipboardprint?
//Query.cpp
//Query.cppview plaincopy to clipboardprint?
<PRE class=csharp name="code">/**
* 紼嬪簭緲昏瘧璇存槑
* 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
*
* @access public
* @param vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細鍚戦噺瀹瑰櫒
* @return void
*/
void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
vecTerm.push_back(m_sSegQuery.substr(0,idx));
m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);
}
}
</PRE>
<PRE class=csharp name="code"> </PRE>
<PRE class=csharp name="code"><PRE class=csharp name="code">/**
* 紼嬪簭緲昏瘧璇存槑
* 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp;
*
* @access public
* @param vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp;
* @return string 0
*/
bool CQuery::GetRelevantRst
(
vector<STRING></STRING> &vecTerm,
map &mapBuckets,
set<STRING></STRING> &setRelevantRst
) const
{
set<STRING></STRING> setSRst;
bool bFirst=true;
vector<STRING></STRING>::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucketstrBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucketstrBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
bFirst = false;
}
return true;
}</PRE>
</PRE>
鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?nbsp;
view plaincopy to clipboardprint?/** * 紼嬪簭緲昏瘧璇存槑 * 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑 * * @access public * @param vector<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細鍚戦噺瀹瑰櫒 * @return void */ void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm) { string::size_type idx; while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) { vecTerm.push_back(m_sSegQuery.substr(0,idx)); m_sSegQuery = m_sSegQuery.substr(idx+3); } } /**
* 紼嬪簭緲昏瘧璇存槑
* 灝嗕互"/"鍒掑垎寮鐨勫叧閿瓧涓涓欏哄簭鏀懼叆涓涓悜閲忓鍣ㄤ腑
*
* @access public
* @param vector 鍙傛暟鐨勬眽瀛楄鏄庯細鍚戦噺瀹瑰櫒
* @return void
*/
void CQuery::ParseQuery(vector &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
vecTerm.push_back(m_sSegQuery.substr(0,idx));
m_sSegQuery = m_sSegQuery.substr(idx+3);
}
}
view plaincopy to clipboardprint?
view plaincopy to clipboardprint?<PRE class=csharp name="code">/** * 紼嬪簭緲昏瘧璇存槑 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp; * * @access public * @param vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp; * @return string 0 */ bool CQuery::GetRelevantRst ( vector<STRING></STRING> &vecTerm, map &mapBuckets, set<STRING></STRING> &setRelevantRst ) const { set<STRING></STRING> setSRst; bool bFirst=true; vector<STRING></STRING>::iterator itTerm = vecTerm.begin(); for ( ; itTerm != vecTerm.end(); ++itTerm ) { setSRst.clear(); copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin())); map mapRstDoc; string docid; int doccnt; map::iterator itBuckets = mapBuckets.find(*itTerm); if (itBuckets != mapBuckets.end()) { string strBucket = (*itBuckets).second; string::size_type idx; idx = strBucket.find_first_not_of(" "); strBucket = strBucket.substr(idx); while ( (idx = strBucket.find(" ")) != string::npos ) { docid = strBucket.substr(0,idx); doccnt = 0; if (docid.empty()) continue; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); strBucket = strBucket.substr(idx+1); } // remember the last one docid = strBucket; doccnt = 0; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); } // sort by term frequencty multimap > newRstDoc; map::iterator it0 = mapRstDoc.begin(); for ( ; it0 != mapRstDoc.end(); ++it0 ){ newRstDoc.insert( pair((*it0).second,(*it0).first) ); } multimap::iterator itNewRstDoc = newRstDoc.begin(); setRelevantRst.clear(); for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){ string docid = (*itNewRstDoc).second; if (bFirst==true) { setRelevantRst.insert(docid); continue; } if ( setSRst.find(docid) != setSRst.end() ){ setRelevantRst.insert(docid); } } //cout << 
"setRelevantRst.size(): " << setRelevantRst.size() << "<BR>"; bFirst = false; } return true; }</PRE> view plaincopy to clipboardprint?/** * 紼嬪簭緲昏瘧璇存槑 * 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?nbsp; * * @access public * @param vector<STRING></STRING> map set<STRING></STRING> 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?nbsp; * @return string 0 */ bool CQuery::GetRelevantRst ( vector<STRING></STRING> &vecTerm, map &mapBuckets, set<STRING></STRING> &setRelevantRst ) const { set<STRING></STRING> setSRst; bool bFirst=true; vector<STRING></STRING>::iterator itTerm = vecTerm.begin(); for ( ; itTerm != vecTerm.end(); ++itTerm ) { setSRst.clear(); copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin())); map mapRstDoc; string docid; int doccnt; map::iterator itBuckets = mapBuckets.find(*itTerm); if (itBuckets != mapBuckets.end()) { string strBucket = (*itBuckets).second; string::size_type idx; idx = strBucket.find_first_not_of(" "); strBucket = strBucket.substr(idx); while ( (idx = strBucket.find(" ")) != string::npos ) { docid = strBucket.substr(0,idx); doccnt = 0; if (docid.empty()) continue; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); strBucket = strBucket.substr(idx+1); } // remember the last one docid = strBucket; doccnt = 0; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); } // sort by term frequencty multimap > newRstDoc; map::iterator it0 = mapRstDoc.begin(); for ( ; it0 != mapRstDoc.end(); ++it0 ){ newRstDoc.insert( pair((*it0).second,(*it0).first) ); } multimap::iterator itNewRstDoc = newRstDoc.begin(); setRelevantRst.clear(); for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){ string docid = (*itNewRstDoc).second; if (bFirst==true) { setRelevantRst.insert(docid); continue; } if ( 
setSRst.find(docid) != setSRst.end() ){ setRelevantRst.insert(docid); } } //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>"; bFirst = false; } return true; } /**
* 紼嬪簭緲昏瘧璇存槑
* 鐩稿叧鎬у垎鏋愭煡璇紝鏋勯犵粨鏋滈泦鍚坰etRelevantRst //鐡墮鎵鍦?br> *
* @access public
* @param vector map set 鍙傛暟鐨勬眽瀛楄鏄庯細 鐢ㄦ埛鎻愪氦鍏抽敭瀛楃殑鍒嗚瘝緇勶紝鍊掓帓绱㈠紩鏄犲皠錛岀浉鍏蟲х粨鏋滈泦鍚?br> * @return string 0
*/
bool CQuery::GetRelevantRst
(
vector &vecTerm,
map &mapBuckets,
set &setRelevantRst
) const
{
set setSRst;
bool bFirst=true;
vector::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
bFirst = false;
}
return true;
}
鎺ヤ笅鏉ョ殑灝辨槸鐜板疄浜嗭紝鍓嶉潰閮藉彧鏄鐞嗘暟鎹緱鍒?setRelevantRst 榪欎釜鏌ヨ緇撴瀯闆嗗悎,榪欓噷灝變笉澶氳浜嗕笅闈㈠氨鍜宲hp涔嬬被鐨勮剼鏈璦宸笉澶氾紝鏍煎紡鍖栫粨鏋滈泦鍚堝茍鏄劇ず鍑烘潵銆?br>//TSESearch.cpp
view plaincopy to clipboardprint?
//涓嬮潰寮濮嬫樉紺?nbsp;
CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);
iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);