??xml version="1.0" encoding="utf-8" standalone="yes"?> int main(int argc, char* argv[]) //./CrtInvertedIdx moon.fidx.sort > sun.iidx string strLine,strDocNum,tmp1=""; if (tmp.size()<2 || tmp.size() > 8) continue; if (tmp1.empty()) tmp1=tmp; if (tmp == tmp1) cout << tmp1 << "\t" << strDocNum << endl; cnt++; return 0; q里输入 Tianwang.raw.*****QDoc.idxQUrl.idx.sort_uniq{三个文Ӟ输出一个Tianwang.raw.***.seg 分词完毕的文?/p>
int main(int argc, char* argv[]) //ifstream ifs("Tianwang.raw.2559638448"); ifstream ifsUrl("Url.idx.sort_uniq"); //排序q消重后的url字典 while (getline(ifsUrl,strLine)) //偏离url字典存入一个向量内存中 memset(chksum, 0, 33); while (getline(ifsDoc,strLine)) //偏离字典文g其攑օ一个向量内存中 memset(chksum, 0, 33); strFileName += ".seg"; // find document according to docId char *s; // skip Head //iDocument.m_sBody = s; delete[] pContent; CStrFun::ReplaceStr(strLine, " ", " "); return(0); 详细的文件功能和介绍都在q里有了介绍自顶向下学搜索引擎——北大天|搜索引擎TSE分析?qing)完全注释[5]倒排索引的徏立及(qing)文g介绍 CrtForwardIdx.cpp文g int main(int argc, char* argv[]) //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx string strLine,strDocNum; cnt++; while ( (idx = strLine.find(SEPARATOR)) != string::npos ) //指定查找分界W? //if (cnt==100) break; return 0; author:http://hi.baidu.com/jrckkyy author:http://blog.csdn.net/jrckkyy
{
ifstream ifsImgInfo(argv[1]);
if (!ifsImgInfo)
{
cerr << "Cannot open " << argv[1] << " for input\n";
return -1;
}
int cnt = 0;
while (getline(ifsImgInfo, strLine))
{
string::size_type idx;
string tmp;
idx = strLine.find("\t");
tmp = strLine.substr(0,idx);
{
strDocNum = strDocNum + " " + strLine.substr(idx+1);
}
else
{
if ( strDocNum.empty() )
strDocNum = strDocNum + " " + strLine.substr(idx+1);
tmp1 = tmp;
strDocNum.clear();
strDocNum = strDocNum + " " + strLine.substr(idx+1);
}
//if (cnt==100) break;
}
cout << tmp1 << "\t" << strDocNum << endl; //倒排索引中每个字典单词后的文档编号以table键ؓ(f)间隔
}
]]>
{
string strLine, strFileName=argv[1];
CUrl iUrl;
vector<CUrl> vecCUrl;
CDocument iDocument;
vector<CDocument> vecCDocument;
unsigned int docId = 0;
ifstream ifs(strFileName.c_str()); //DocSegment Tianwang.raw.****
if (!ifs)
{
cerr << "Cannot open tianwang.img.info for input\n";
return -1;
}
if (!ifsUrl)
{
cerr << "Cannot open Url.idx.sort_uniq for input\n";
return -1;
}
ifstream ifsDoc("Doc.idx"); //字典文g
if (!ifsDoc)
{
cerr << "Cannot open Doc.idx for input\n";
return -1;
}
{
char chksum[33];
int docid;
sscanf( strLine.c_str(), "%s%d", chksum, &docid );
iUrl.m_sChecksum = chksum;
iUrl.m_nDocId = docid;
vecCUrl.push_back(iUrl);
}
{
int docid,pos,length;
char chksum[33];
sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
iDocument.m_nDocId = docid;
iDocument.m_nPos = pos;
iDocument.m_nLength = length;
iDocument.m_sChecksum = chksum;
vecCDocument.push_back(iDocument);
}
ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary); //讄完成分词后的数据输出文g
for ( docId=0; docId<MAX_DOC_ID; docId++ )
{
int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
char *pContent = new char[length+1];
memset(pContent, 0, length+1);
ifs.seekg(vecCDocument[docId].m_nPos);
ifs.read(pContent, length);
s = pContent;
int bytesRead = 0,newlines = 0;
while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
{
if (*s == '\n')
newlines++;
else
newlines = 0;
s++;
bytesRead++;
}
if (bytesRead == HEADER_BUF_SIZE-1) continue;
// skip header
bytesRead = 0,newlines = 0;
while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
{
if (*s == '\n')
newlines++;
else
newlines = 0;
s++;
bytesRead++;
}
if (bytesRead == HEADER_BUF_SIZE-1) continue;
iDocument.RemoveTags(s); //去除<>
iDocument.m_sBodyNoTags = s;
string strLine = iDocument.m_sBodyNoTags;
CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "
// segment the document 具体分词处理
CHzSeg iHzSeg;
strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
fout << docId << endl << strLine;
fout << endl;
}
}
q里只是光掠媄(jing)式的q一遍大概的代码Q后面我?x)有专题详细讲?parse html ?segment docment {技?/p>
]]>
{
ifstream ifsImgInfo(argv[1]);
if (!ifsImgInfo)
{
cerr << "Cannot open " << argv[1] << " for input\n";
return -1;
}
int cnt = 0;
while (getline(ifsImgInfo, strLine))
{
string::size_type idx;
if (cnt%2 == 1) //奇数行ؓ(f)文档~号
{
strDocNum = strLine.substr(0,strLine.size());
continue;
}
if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
{
continue;
}
{
string tmp1 = strLine.substr(0,idx);
cout << tmp1 << "\t" << strDocNum << endl;
strLine = strLine.substr(idx + SEPARATOR.size());
}
}
}
]]>
author:http://blog.csdn.net/jrckkyy
上一主要介l了倒排索引建立相关的文件及(qing)中间文g?br>TSE建立索引在运行程序上的大致步骤可以简化分Z下几步:(x)
1、运行命?./DocIndex
?x)用C个文?tianwang.raw.520 //爬取回来的原始文Ӟ包含多个|页的所有信息,所以很大,q也是一个有待解决的问题Q到底存成大文gQ如果过大会(x)过2G?G的限Ӟ而且文gq大索引效率q低Q还是小文gQ文件数q多用于打开关闭文g句柄的消耗过大)q有待思考,q就是存储方案的解决最l肯定是要存为分布式的,最lL仉肯定是会(x)上TB的,TSE只支持小型的搜烦(ch)引擎需求?nbsp;
?x)生一下三个文?Doc.idx, Url.idx, DocId2Url.idx //Data文g夹中的Doc.idx DocId2Url.idx和Doc.idx
2、运行命?sort Url.idx|uniq > Url.idx.sort_uniq //Data文g夹中的Url.idx.sort_uniq
?x)用C个文?Url.idx文g //md5 hash 之后的url完整地址和document id值对
?x)生一个文?Url.idx.sort_uniq //URL消重Qmd5 hash排序Q提高检索效?/p>
3、运行命?./DocSegment Tianwang.raw.2559638448
?x)用C个文?Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文?Q每个页面包含http_(d)分词为后面徏立到排烦(ch)引做准备
?x)生一个文?Tianwang.raw.2559638448.seg //分词文gQ由一行document id号和一行文档分词组Q只Ҏ(gu)个文?lt;html></html>?lt;head></head><body></body>{文字标C的文本进行分l)构成
4、运行命?./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立独立的正向烦(ch)?/p>
5、运行命?br>#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort
6、运行命?./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引
我们先从建立索引的第一个程序DocIndex.cpp开始分析?注释U定QTianwang.raw.2559638448是抓回来合ƈ成的大文Ӟ后面叫大文Ӟ里面包含了很多篇html文档Q里面的文档有规律的分隔叫做一一的文档)
//DocIndex.h start-------------------------------------------------------------
#ifndef _COMM_H_040708_
#define _COMM_H_040708_
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;

// Upper bound on the number of bytes scanned when skipping a record/HTTP header.
const unsigned int HEADER_BUF_SIZE = 1024;
// Number of search results shown per result page.
const unsigned int RstPerPage = 20;

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312; // needed by DocSegment.cpp
const unsigned int MAX_DOC_IDX_ID = 22104;

//const string IMG_INFO_NAME("./Data/s1.1");
// Inverted index file. Each line is "<term> <docId> <docId> ...", for example:
//   <term>  14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//   <term>  1085 1222
// NOTE(review): the original remark mentions roughly 90,000 entries covering
// special symbols, punctuation and Chinese characters - confirm against the data set.
const string INF_INFO_NAME = "./Data/sun.iidx";
// Document index file. NOTE(review): the original comment called this the
// inverted index as well, which looks like a copy/paste slip - confirm.
const string DOC_IDX_NAME = "./Data/Doc.idx";
const string RAWPAGE_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0";

//iceway
// Raw page file name as used by Docindex.cpp.
const string DOC_FILE_NAME("Tianwang.swu.iceway.1.0");
// Raw page file path as used by Snapshot.cpp.
const string Data_DOC_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");
//const string THUMBNAIL_DIR("/ImgSE/timg/");
#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------
#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"
//iceway(mnsc)
#include "Comm.h"
#include
using namespace std;
int main(int argc, char* argv[])
{
//ifstream ifs("Tianwang.raw.2559638448");
//ifstream ifs("Tianwang.raw.3023555472");
//iceway(mnsc)
ifstream ifs(DOC_FILE_NAME.c_str()); //打开Tianwang.raw.3023555472文gQ最原始的文?br> if (!ifs)
{
cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
return -1;
}
ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立q打开Url.idx文g
if( !ofsUrl )
{
cout << "error open file " << endl;
}
ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立q打开Doc.idx文g
if( !ofsDoc )
{
cout << "error open file " << endl;
}
ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立q打开DocId2Url.idx文g
if( !ofsDocId2Url )
{
cout << "error open file " << endl;
}
int cnt=0; //文档~号?开始计?br> string strLine,strPage;
CUrl iUrl;
CDocument iDocument;
CMD5 iMD5;
int nOffset = ifs.tellg();
while (getline(ifs, strLine))
{
if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
{
nOffset = ifs.tellg();
continue;
}
if (!strncmp(strLine.c_str(), "version: 1.0", 12)) //判断W一行是否是version: 1.0如果是就解析下去
{
if(!getline(ifs, strLine)) break;
if (!strncmp(strLine.c_str(), "url: ", 4)) //判断W二行是否是url: 如果是则解析下去
{
iUrl.m_sUrl = strLine.substr(5); //截取url: 五个字符之后的url内容
iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); //对url用md5 hash处理
iUrl.m_sChecksum = iMD5.ToString(); //字W数l组合成字符串这个函数在Md5.h中实?/p>
} else
{
continue;
}
while (getline(ifs, strLine))
{
if (!strncmp(strLine.c_str(), "length: ", 8)) //一直读下去直到判断Ҏ(gu)(相对W五?惺欠袯ength: 是则接下下去
{
sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); //该块所代表|页的实际网内定w度放入iDocument数据l构?br> break;
}
}
getline(ifs, strLine); //跌相对W六行故意留的一个空?/p>
iDocument.m_nDocId = cnt; //文档编可值到iDocument数据l构?br> iDocument.m_nPos = nOffset; //文档l尾在大文g中的l束行号
char *pContent = new char[iDocument.m_nLength+1]; //新徏该文档长度的字符串指?/p>
memset(pContent, 0, iDocument.m_nLength+1); //每一位初始化?
ifs.read(pContent, iDocument.m_nLength); //Ҏ(gu)获得的文档长度读取澹(其中包含协议?d文档内容
iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
iDocument.m_sChecksum = iMD5.ToString(); //字W数l组合成字符串这个函数在Md5.h中实?br>
delete[] pContent;
ofsUrl << iUrl.m_sChecksum ; //md5hash后的url写入Url.idx文g
ofsUrl << "\t" << iDocument.m_nDocId << endl; //在一行中一个tab距离分隔Q将文g~号写入Url.idx文g
ofsDoc << iDocument.m_nDocId ; //文件编号写入Doc.idx文g
ofsDoc << "\t" << iDocument.m_nPos ; //在一行中一个tab距离分隔Q将该文档结束行h(同样也是下一文档开始行?写入Doc.idx文g
//ofsDoc << "\t" << iDocument.m_nLength ;
ofsDoc << "\t" << iDocument.m_sChecksum << endl; //在一行中一个tab距离分隔Q将md5hash后的url写入Doc.idx文g
ofsDocId2Url << iDocument.m_nDocId ; //文件编号写入DocId2Url.idx文g
ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; //该文档的完整url写入DocId2Url.idx文g
cnt++; //文档~号加一说明该以文档分析完毕Q生成下一文档的编?br> }
nOffset = ifs.tellg();
}
//最后一行只有文档号和上一文档结束号
ofsDoc << cnt ;
ofsDoc << "\t" << nOffset << endl;
return(0);
}
//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy
author:http://blog.csdn.net/jrckkyy
TSE用的是将抓取回来的网|档全部装入一个大文档Q让后对q一个大文档内的数据整体l一的徏索引Q其中包含了几个步骤?/p>
view plaincopy to clipboardprint?
1. The document index (Doc.idx) keeps information about each document.
It is a fixed width ISAM (Index sequential access mode) index, ordered by docID.
The information stored in each entry includes a pointer into the repository,
a document length, a document checksum.
//Doc.idx 文档~号 文档长度 checksum hash?nbsp;
0 0 bc9ce846d7987c4534f53d423380ba70
1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
2 141624 d019433008538f65329ae8e39b86026c
3 142350 5705b8f58110f9ad61b1321c52605795
//Doc.idx end
The url index (url.idx) is used to convert URLs into docIDs.
//url.idx
5c36868a9c5117eadbda747cbdb0725f 0
3272e136dd90263ee306a835c6c70d77 1
6b8601bb3bb9ab80f868d549b5c5a5f3 2
3f9eba99fa788954b5ff7f35a5db6e1f 3
//url.idx end
It is a list of URL checksums with their corresponding docIDs and is sorted by
checksum. In order to find the docID of a particular URL, the URL's checksum
is computed and a binary search is performed on the checksums file to find its
docID.
./DocIndex
got Doc.idx, Url.idx, DocId2Url.idx //Data文g夹中的Doc.idx DocId2Url.idx和Doc.idx?nbsp;
//DocId2Url.idx
0 http://*.*.edu.cn/index.aspx
1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118
2 http://*.*.edu.cn/0102.html
3 http://*.*.edu.cn/0103.html
//DocId2Url.idx end
2. sort Url.idx|uniq > Url.idx.sort_uniq //Data文g夹中的Url.idx.sort_uniq
//Url.idx.sort_uniq
//对hashD行排?nbsp;
000bfdfd8b2dedd926b58ba00d40986b 1111
000c7e34b653b5135a2361c6818e48dc 1831
0019d12f438eec910a06a606f570fde8 366
0033f7c005ec776f67f496cd8bc4ae0d 2103
3. Segment document to terms, (with finding document according to the url)
./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文?Q每个页面包含http?nbsp;
got Tianwang.raw.2559638448.seg
//Tianwang.raw.2559638448 爬取的原始网|件在文档内部每一个文档之间应该是通过versionQ?lt;/html>和回车做标志位分割的
version: 1.0
url: http://***.105.138.175/Default2.asp?lang=gb
origin: http://***.105.138.175/
date: Fri, 23 May 2008 20:01:36 GMT
ip: 162.105.138.175
length: 38413
HTTP/1.1 200 OK
Server: Microsoft-IIS/5.0
Date: Fri, 23 May 2008 11:17:49 GMT
Connection: keep-alive
Connection: Keep-Alive
Content-Length: 38088
Content-Type: text/html; Charset=gb2312
Expires: Fri, 23 May 2008 11:17:49 GMT
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/
Cache-control: private
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"
<html>
<head>
<title>Apabi数字资源q_</title>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">
<META NAME="DESCRIPTION" CONTENT="数字图书?Ҏ(gu)数字图书??sh)子图??sh)子?ebook e?Apabi 数字资源q_">
<link rel="stylesheet" type="text/css" href="css\common.css">
<style type="text/css">
<!--
.style4 {color: #666666}
-->
</style>
<script LANGUAGE="vbscript">
...
</script>
<Script Language="javascript">
...
</Script>
</head>
<body leftmargin="0" topmargin="0">
</body>
</html>
//Tianwang.raw.2559638448 end
//Tianwang.raw.2559638448.seg 每个页面分成一行如?注意中间没有回R作ؓ(f)分隔)
1
...
...
...
2
...
...
...
//Tianwang.raw.2559638448.seg end
//下是 Tiny search 非必d?nbsp;
4. Create forward index (docic-->termid) //建立正向索引
./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx
//Tianwang.raw.2559638448.seg 每个页面分成一行如?lt;BR>//分词 DocID<BR>1<BR>三星/ s/ 手机/ 论坛/ ,/ 手机/ 铃声/ 下蝲/ ,/ 手机/ 囄/ 下蝲/ ,/ 手机/<BR>2<BR>...<BR>...<BR>...
1. The document index (Doc.idx) keeps information about each document.
It is a fixed width ISAM (Index sequential access mode) index, ordered by docID.
The information stored in each entry includes a pointer into the repository,
a document length, a document checksum.
//Doc.idx 文档~号 文档长度 checksum hash?/p>
0 0 bc9ce846d7987c4534f53d423380ba70
1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
2 141624 d019433008538f65329ae8e39b86026c
3 142350 5705b8f58110f9ad61b1321c52605795
//Doc.idx end
The url index (url.idx) is used to convert URLs into docIDs.
//url.idx
5c36868a9c5117eadbda747cbdb0725f 0
3272e136dd90263ee306a835c6c70d77 1
6b8601bb3bb9ab80f868d549b5c5a5f3 2
3f9eba99fa788954b5ff7f35a5db6e1f 3
//url.idx end
It is a list of URL checksums with their corresponding docIDs and is sorted by
checksum. In order to find the docID of a particular URL, the URL's checksum
is computed and a binary search is performed on the checksums file to find its
docID.
./DocIndex
got Doc.idx, Url.idx, DocId2Url.idx //Data文g夹中的Doc.idx DocId2Url.idx和Doc.idx?/p>
//DocId2Url.idx
0 http://*.*.edu.cn/index.aspx
1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118
//DocId2Url.idx end
2. sort Url.idx|uniq > Url.idx.sort_uniq //Data文g夹中的Url.idx.sort_uniq
//Url.idx.sort_uniq
//对hashD行排?/p>
000bfdfd8b2dedd926b58ba00d40986b 1111
000c7e34b653b5135a2361c6818e48dc 1831
0019d12f438eec910a06a606f570fde8 366
0033f7c005ec776f67f496cd8bc4ae0d 2103
3. Segment document to terms, (with finding document according to the url)
./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文?Q每个页面包含http?/p>
got Tianwang.raw.2559638448.seg
//Tianwang.raw.2559638448 爬取的原始网|件在文档内部每一个文档之间应该是通过versionQ?lt;/html>和回车做标志位分割的
version: 1.0
url: http://***.105.138.175/Default2.asp?lang=gb
origin: http://***.105.138.175/
date: Fri, 23 May 2008 20:01:36 GMT
ip: 162.105.138.175
length: 38413
HTTP/1.1 200 OK
Server: Microsoft-IIS/5.0
Date: Fri, 23 May 2008 11:17:49 GMT
Connection: keep-alive
Connection: Keep-Alive
Content-Length: 38088
Content-Type: text/html; Charset=gb2312
Expires: Fri, 23 May 2008 11:17:49 GMT
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/
Cache-control: private
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1、字典的选取Q事实上Ҏ(gu)不同时代不同地方Z的语a?fn)惯是不一L(fng)所以说字典的最元的取值是不同的)
2、倒排索引的徏立(q里p涉及(qing)到爬虫的抓取和烦(ch)引的建立后面重点介l这2点,搜烦(ch)引擎的效率和服务质量实效性瓶颈在q里Q?/p>
3、相x分析(Ҏ(gu)回来的文档分词徏索引和用户关键字分词法上要对应Q?/p>
后面文章?x)重点介l爬虫的抓取和烦(ch)引的建立?/p>
//TSESearch.cpp中:(x)
view plaincopy to clipboardprint?
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //get到的查询变量分词分成 "? ? 你们/ ? 格式"
vector<STRING></STRING> vecTerm;
iQuery.ParseQuery(vecTerm); //以"/"划分开的关键字一一序攑օ一个向量容器中
set<STRING></STRING> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//搜烦(ch)完毕
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //get到的查询变量分词分成 "? ? 你们/ ? 格式"
vector vecTerm;
iQuery.ParseQuery(vecTerm); //以"/"划分开的关键字一一序攑օ一个向量容器中
set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//搜烦(ch)完毕view plaincopy to clipboardprint?
看CHzSeg 中的q个Ҏ(gu)
看CHzSeg 中的q个Ҏ(gu)view plaincopy to clipboardprint?
//ChSeg/HzSeg.h
//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**
* E序译说明
* q一步净化数据,转换汉字
* @access public
* @param CDict, string 参数的汉字说?字典Q查询字W串
* @return string 0
*/
// process a sentence before segmentation
//在分词前处理句子
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;
while (!s1.empty())
{
unsigned char ch=(unsigned char) s1[0];
if(ch<128)
{ // deal with ASCII
i=1;
len = s1.size();
while (i<LEN len="s1.length();" i="0;" 中文标点{非汉字字符="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
{
ii=i+2; // 假定没有半个汉字
}
if (i==0) ii=i+2;
// 不处理中文空?nbsp;
if (!(ch==161 && (unsigned char)s1[1]==161))
{
if (i <= s1.size()) // yhf
// 其他的非汉字双字节字W可能连l输?nbsp;
s2 += s1.substr(0, i) + SEPARATOR;
else break; // yhf
}
if (i <= s1.size()) // yhf
s1s1=s1.substr(i);
else break; //yhf
continue;
}
}
// 以下处理汉字?nbsp;
i = 2;
len = s1.length();
while(i<LEN></LEN>=176)
// while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)
i+=2;
s2+=SegmentHzStrMM(dict, s1.substr(0,i));
if (i <= len) // yhf
s1s1=s1.substr(i);
else break; // yhf
}
return s2;
}
/**
* E序译说明
* q一步净化数据,转换汉字
* @access public
* @param CDict, string 参数的汉字说?字典Q查询字W串
* @return string 0
*/
// process a sentence before segmentation
//在分词前处理句子
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;
while (!s1.empty())
{
unsigned char ch=(unsigned char) s1[0];
if(ch<128)
{ // deal with ASCII
i=1;
len = s1.size();
while (i=161)
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
&& (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
&& (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
|| (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
|| (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
{
i=i+2; // 假定没有半个汉字
}
if (i==0) i=i+2;
// 不处理中文空?br> if (!(ch==161 && (unsigned char)s1[1]==161))
{
if (i <= s1.size()) // yhf
// 其他的非汉字双字节字W可能连l输?br> s2 += s1.substr(0, i) + SEPARATOR;
else break; // yhf
}
if (i <= s1.size()) // yhf
s1=s1.substr(i);
else break; //yhf
continue;
}
}
// 以下处理汉字?/p>
i = 2;
len = s1.length();
while(i=176)
// while(i=128 && (unsigned char)s1[i]!=161)
i+=2;
s2+=SegmentHzStrMM(dict, s1.substr(0,i));
if (i <= len) // yhf
s1=s1.substr(i);
else break; // yhf
}
return s2;
}view plaincopy to clipboardprint?
view plaincopy to clipboardprint?
//Query.cpp
//Query.cppview plaincopy to clipboardprint?
<PRE class=csharp name="code">/**
* E序译说明
* 以"/"划分开的关键字一一序攑օ一个向量容器中
*
* @access public
* @param vector<STRING></STRING> 参数的汉字说明:(x)向量容器
* @return void
*/
void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
vecTerm.push_back(m_sSegQuery.substr(0,idx));
m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);
}
}
</PRE>
<PRE class=csharp name="code"> </PRE>
<PRE class=csharp name="code"><PRE class=csharp name="code">/**
* E序译说明
* 相关性分析查询,构造结果集合setRelevantRst //瓉所?nbsp;
*
* @access public
* @param vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明:(x) 用户提交关键字的分词l,倒排索引映射Q相x结果集?nbsp;
* @return string 0
*/
bool CQuery::GetRelevantRst
(
vector<STRING></STRING> &vecTerm,
map &mapBuckets,
set<STRING></STRING> &setRelevantRst
) const
{
set<STRING></STRING> setSRst;
bool bFirst=true;
vector<STRING></STRING>::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucketstrBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucketstrBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
bFirst = false;
}
return true;
}</PRE>
</PRE>
接下来的是现实了,前面都只是处理数据得?setRelevantRst q个查询l构集合,q里׃多说了下面就和php之类的脚本语a差不多,格式化结果集合ƈ昄出来?nbsp;
view plaincopy to clipboardprint?/** * E序译说明 * 以"/"划分开的关键字一一序攑օ一个向量容器中 * * @access public * @param vector<STRING></STRING> 参数的汉字说明:(x)向量容器 * @return void */ void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm) { string::size_type idx; while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) { vecTerm.push_back(m_sSegQuery.substr(0,idx)); m_sSegQuery = m_sSegQuery.substr(idx+3); } } /**
* E序译说明
* 以"/"划分开的关键字一一序攑օ一个向量容器中
*
* @access public
* @param vector 参数的汉字说明:(x)向量容器
* @return void
*/
void CQuery::ParseQuery(vector &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
vecTerm.push_back(m_sSegQuery.substr(0,idx));
m_sSegQuery = m_sSegQuery.substr(idx+3);
}
}
view plaincopy to clipboardprint?
view plaincopy to clipboardprint?<PRE class=csharp name="code">/** * E序译说明 * 相关性分析查询,构造结果集合setRelevantRst //瓉所?nbsp; * * @access public * @param vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明:(x) 用户提交关键字的分词l,倒排索引映射Q相x结果集?nbsp; * @return string 0 */ bool CQuery::GetRelevantRst ( vector<STRING></STRING> &vecTerm, map &mapBuckets, set<STRING></STRING> &setRelevantRst ) const { set<STRING></STRING> setSRst; bool bFirst=true; vector<STRING></STRING>::iterator itTerm = vecTerm.begin(); for ( ; itTerm != vecTerm.end(); ++itTerm ) { setSRst.clear(); copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin())); map mapRstDoc; string docid; int doccnt; map::iterator itBuckets = mapBuckets.find(*itTerm); if (itBuckets != mapBuckets.end()) { string strBucket = (*itBuckets).second; string::size_type idx; idx = strBucket.find_first_not_of(" "); strBucket = strBucket.substr(idx); while ( (idx = strBucket.find(" ")) != string::npos ) { docid = strBucket.substr(0,idx); doccnt = 0; if (docid.empty()) continue; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); strBucket = strBucket.substr(idx+1); } // remember the last one docid = strBucket; doccnt = 0; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); } // sort by term frequencty multimap > newRstDoc; map::iterator it0 = mapRstDoc.begin(); for ( ; it0 != mapRstDoc.end(); ++it0 ){ newRstDoc.insert( pair((*it0).second,(*it0).first) ); } multimap::iterator itNewRstDoc = newRstDoc.begin(); setRelevantRst.clear(); for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){ string docid = (*itNewRstDoc).second; if (bFirst==true) { setRelevantRst.insert(docid); continue; } if ( setSRst.find(docid) != setSRst.end() ){ setRelevantRst.insert(docid); } } //cout << "setRelevantRst.size(): " 
<< setRelevantRst.size() << "<BR>"; bFirst = false; } return true; }</PRE> view plaincopy to clipboardprint?/** * E序译说明 * 相关性分析查询,构造结果集合setRelevantRst //瓉所?nbsp; * * @access public * @param vector<STRING></STRING> map set<STRING></STRING> 参数的汉字说明:(x) 用户提交关键字的分词l,倒排索引映射Q相x结果集?nbsp; * @return string 0 */ bool CQuery::GetRelevantRst ( vector<STRING></STRING> &vecTerm, map &mapBuckets, set<STRING></STRING> &setRelevantRst ) const { set<STRING></STRING> setSRst; bool bFirst=true; vector<STRING></STRING>::iterator itTerm = vecTerm.begin(); for ( ; itTerm != vecTerm.end(); ++itTerm ) { setSRst.clear(); copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin())); map mapRstDoc; string docid; int doccnt; map::iterator itBuckets = mapBuckets.find(*itTerm); if (itBuckets != mapBuckets.end()) { string strBucket = (*itBuckets).second; string::size_type idx; idx = strBucket.find_first_not_of(" "); strBucket = strBucket.substr(idx); while ( (idx = strBucket.find(" ")) != string::npos ) { docid = strBucket.substr(0,idx); doccnt = 0; if (docid.empty()) continue; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); strBucket = strBucket.substr(idx+1); } // remember the last one docid = strBucket; doccnt = 0; map::iterator it = mapRstDoc.find(docid); if ( it != mapRstDoc.end() ) { doccnt = (*it).second + 1; mapRstDoc.erase(it); } mapRstDoc.insert( pair(docid,doccnt) ); } // sort by term frequencty multimap > newRstDoc; map::iterator it0 = mapRstDoc.begin(); for ( ; it0 != mapRstDoc.end(); ++it0 ){ newRstDoc.insert( pair((*it0).second,(*it0).first) ); } multimap::iterator itNewRstDoc = newRstDoc.begin(); setRelevantRst.clear(); for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){ string docid = (*itNewRstDoc).second; if (bFirst==true) { setRelevantRst.insert(docid); continue; } if ( setSRst.find(docid) != setSRst.end() ){ 
setRelevantRst.insert(docid); } } //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>"; bFirst = false; } return true; } /**
* E序译说明
* 相关性分析查询,构造结果集合setRelevantRst //瓉所?br> *
* @access public
* @param vector map set 参数的汉字说明:(x) 用户提交关键字的分词l,倒排索引映射Q相x结果集?br> * @return string 0
*/
bool CQuery::GetRelevantRst
(
vector &vecTerm,
map &mapBuckets,
set &setRelevantRst
) const
{
set setSRst;
bool bFirst=true;
vector::iterator itTerm = vecTerm.begin();
for ( ; itTerm != vecTerm.end(); ++itTerm )
{
setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));
map mapRstDoc;
string docid;
int doccnt;
map::iterator itBuckets = mapBuckets.find(*itTerm);
if (itBuckets != mapBuckets.end())
{
string strBucket = (*itBuckets).second;
string::size_type idx;
idx = strBucket.find_first_not_of(" ");
strBucket = strBucket.substr(idx);
while ( (idx = strBucket.find(" ")) != string::npos )
{
docid = strBucket.substr(0,idx);
doccnt = 0;
if (docid.empty()) continue;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
strBucket = strBucket.substr(idx+1);
}
// remember the last one
docid = strBucket;
doccnt = 0;
map::iterator it = mapRstDoc.find(docid);
if ( it != mapRstDoc.end() )
{
doccnt = (*it).second + 1;
mapRstDoc.erase(it);
}
mapRstDoc.insert( pair(docid,doccnt) );
}
// sort by term frequencty
multimap > newRstDoc;
map::iterator it0 = mapRstDoc.begin();
for ( ; it0 != mapRstDoc.end(); ++it0 ){
newRstDoc.insert( pair((*it0).second,(*it0).first) );
}
multimap::iterator itNewRstDoc = newRstDoc.begin();
setRelevantRst.clear();
for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
string docid = (*itNewRstDoc).second;
if (bFirst==true) {
setRelevantRst.insert(docid);
continue;
}
if ( setSRst.find(docid) != setSRst.end() ){
setRelevantRst.insert(docid);
}
}
//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
bFirst = false;
}
return true;
}
接下来的是现实了,前面都只是处理数据得?setRelevantRst q个查询l构集合,q里׃多说了下面就和php之类的脚本语a差不多,格式化结果集合ƈ昄出来?br>//TSESearch.cpp
view plaincopy to clipboardprint?
//下面开始显C?nbsp;
CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);
iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);
//TSESearch.cpp CQuery iQuery;
iQuery.GetInputs(); //具体E序开始执?br> // current query & result page number
iQuery.SetQuery();
iQuery.SetStart();
// begin to search
//开始具体搜索程?br> gettimeofday(&begin_tv,&tz); //开始计时获取程序运行时间差
iQuery.GetInvLists(mapBuckets); //所有字W集存入映射变量?nbsp;瓉所?br> iQuery.GetDocIdx(vecDocIdx); //倒排索引存入向量?nbsp; 瓉所?br>
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //get到的查询变量分词分成 "? ? 你们/ ? 格式"
vector vecTerm;
iQuery.ParseQuery(vecTerm); //以"/"划分开的关键字一一序攑օ一个向量容器中
set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//搜烦(ch)完毕按照序我们首先深入qiQuery对象的类CQuery
//Query.cpp
1、GetInputs
q个Ҏ(gu)的功能是前台getq来的变量{换到HtmlInputsl构体数l中如下例子和代码:(x)
//假设前台查询的关键字?1"着HtmlInputs中内容输出如?nbsp; //HtmlInputs[0].Name word //HtmlInputs[0].Value 1 //HtmlInputs[1].Name www //HtmlInputs[1].Value 搜烦(ch) //HtmlInputs[2].Name cdtype //HtmlInputs[2].Value GB
/*
* Get form information throught environment varible.
* return 0 if succeed, otherwise exit.
*/
/**
* E序译说明
* 处理GETq来的表?br> *
* @access public
* @return string 0
*/
int CQuery::GetInputs()
{
int i,j;
char *mode = getenv("REQUEST_METHOD"); //q回环境变量的?q里环境变量 REQUEST_METHOD ?get Ҏ(gu)
char *tempstr; //GET变量字符串或POST字符串内?br> char *in_line;
int length; //GET变量串长度或POST内容长度
cout << "Content-type: text/html\n\n";
//cout << "Cache-Control: no-cache\n";
//cout << "Expires: Tue, 08 Apr 1997 17:20:00 GMT\n";
//cout << "Expires: 0\n";
//cout << "Pragma: no-cache\n\n";
cout << "\n";
cout << "\n";
//cout << "\n";
//cout << "\n";
//cout << "\n";
cout << "\n";
cout.flush(); //释放输出~冲?输出头部head和之前的html标签内容
//cout <<"" << endl;
if (mode==NULL) return 1;
if (strcmp(mode, "POST") == 0)
{
length = atoi(getenv("CONTENT_LENGTH")); //如果是POSTҎ(gu)着获得环境变量CONTENT_LENGTH的整型?br> if (length==0 || length>=256)
return 1;
in_line = (char*)malloc(length + 1);
read(STDIN_FILENO, in_line, length);
in_line[length]='\0';
}
else if (strcmp(mode, "GET") == 0)
{
char* inputstr = getenv("QUERY_STRING"); //如果是GETҎ(gu)着获得环境变量QUERY_STRING的字W串?br> length = strlen(inputstr);
if (inputstr==0 || length>=256)
return 1;
//获取get内容长度q把get Q后面的参数赋值给变量in_line
in_line = (char*)malloc(length + 1);
strcpy(in_line, inputstr); //心溢出d
}
tempstr = (char*)malloc(length + 1); //获取post内容或get内容长度
if(tempstr == NULL)
{
printf("\n");
printf("\n");
printf("Major failure #1;please notify the webmaster\n");
printf("\n");
fflush(stdout); //输出~冲?br> exit(2); //错误q回
}
j=0;
for (i=0; i char
strcpy(HtmlInputs[HtmlInputCount].Name,tempstr);
if (i == length - 1)
{
strcpy(HtmlInputs[HtmlInputCount].Value,"");
HtmlInputCount++;
}
j=0;
}
else if ((in_line[i] == '&') || (i==length-1))
{
if (i==length-1)
{
if(in_line[i] == '+')tempstr[j]=' ';
else tempstr[j] = in_line[i];
j++;
}
tempstr[j]='\0';
CStrFun::Translate(tempstr); //URL~码形式的参数{换成字符?%** -> char
strcpy(HtmlInputs[HtmlInputCount].Value,tempstr);
HtmlInputCount++;
j=0;
}
else if (in_line[i] == '+')
{
tempstr[j]=' ';
j++;
}
else
{
tempstr[j]=in_line[i]; //l合get中的变量如word www cdtype
j++;
}
//cout<";
//cout<";
//cout.flush();
}
/*
for (int kk = 0; kk < HtmlInputCount ; ++kk )
{
cout<<"Name="<";
cout<<"Value="<";
}
//假设前台查询的关键字?1"输出如下
//Name=word
//Value=1
//Name=www
//Value= 搜烦(ch)
//Name=cdtype
//Value=GB
*/
if(in_line) free(in_line);
if(tempstr) free(tempstr);
return 0;
}
2、SetQuery
//Query.cpp
void CQuery::SetQuery()
{
string q = HtmlInputs[0].Value;
CStrFun::Str2Lower(q,q.size()); //大写变小?br> m_sQuery = q; //准备查询关键?br>}
3、SetStart
void CQuery::SetQuery()
{
string q = HtmlInputs[0].Value;
CStrFun::Str2Lower(q,q.size()); //大写变小写word变量里的?br> m_sQuery = q; //讄查询关键?br>}
4、GetInvLists
/**
 * Load the inverted index file into mapBuckets.
 *
 * INF_INFO_NAME is defined in Comm.h ("./Data/sun.iidx"). Each line holds a
 * term, a tab, and the list of document numbers for that term, e.g.
 *   <term>\t14383 16151 16151 1683 207
 * The map key is the term; the value is the untouched document-number list.
 *
 * @param mapBuckets  receives one entry per term; an entry that already
 *                    exists is kept (std::map::insert does not overwrite).
 * @return false if the index file cannot be opened, true otherwise.
 */
bool CQuery::GetInvLists(map<string, string> &mapBuckets) const
{
	ifstream ifsInvInfo(INF_INFO_NAME.c_str(), ios::binary);
	if (!ifsInvInfo) {
		cerr << "Cannot open " << INF_INFO_NAME << " for input\n";
		return false;
	}

	string strLine;
	while (getline(ifsInvInfo, strLine)) {
		const string::size_type idx = strLine.find('\t');
		if (idx == string::npos)
			continue;   // no separator: not a "term<TAB>doc list" line, skip it

		mapBuckets.insert(map<string,string>::value_type(
			strLine.substr(0, idx),      // term
			strLine.substr(idx + 1)));   // document numbers
	}

	return true;
}
5、GetDocIdx
bool CQuery::GetDocIdx(vector &vecDocIdx) const
{
ifstream ifs(DOC_IDX_NAME.c_str(), ios::binary);
//0 0 bc9ce846d7987c4534f53d423380ba70
//1 76760 4f47a3cad91f7d35f4bb6b2a638420e5
//2 141624 d019433008538f65329ae8e39b86026c
if (!ifs) {
cerr << "Cannot open " << DOC_IDX_NAME << " for input\n"; //以二q制形式打开一个文件的输入缓ԌDOC_IDX_NAME在头文gComm.h中定义了的, const string INF_INFO_NAME("./Data/Doc.idx");
return false;
}
string strLine, strDocid, strUrl;
while (getline(ifs,strLine)){
DocIdx di;
sscanf( strLine.c_str(), "%d%d", &di.docid, &di.offset ); //只保留了前面两项文档号和偏移?br> vecDocIdx.push_back(di); //导入l构体向量中
}
return true;
}
本着黑客精神，我将陆续把最近分析注释TSE搜索引擎的心得发布出来，老鸟、大虾、大牛、高手飘过就是了，若愿意费心指点下小弟的，在下不胜感激，有问题的朋友直接留言讨论。由于本人水平有限，分析和翻译难免有错，大家见笑了。
上学期拜读了James F.Kurose著的《计算机网络-自顶向下方法与Internet特色(第三版影印版)》，觉得写得确实不错(希望没看的朋友一定要买来看看)，自己也来搞个自顶向下的学习方法，先从用户看得到的东西出发分析研究搜索引擎，下面我们就来看看各大搜索引擎搜索界面的代码，你所需要特别注意的是form表单中的action。
雅虎 http://www.yahoo.com/：
<form name=s1 style="margin-bottom:0" action="<table cellpadding=0 cellspacing=0 border=0><tr><td>
<input type=text size=30 name=p title="enter search terms here">
<input type=submit value=Search> </td><td><font face=arial size=-2>·
<a href=" search</a><br>·
<a href=" popular</a></font></td></tr></table></form>
谷歌 http://www.g.cn：
<form method=GET action=/search><tr><td nowrap>
<font size=-1><input type=text name=q size=41 maxlength=2048 value="jrckkyy" title="Google 搜烦(ch)"> <input type=submit name=btnG value="Google 搜烦(ch)"><input type=hidden name=complete value=1><input type=hidden name=hl value="zh-CN"><input type=hidden name=newwindow value=1><input type=hidden name=sa value="2"></font></td></tr></form>
百度 http://www.baidu.com：
<form name=f2 action="/s">
<tr valign="middle">
<td nowrap>
<input type=hidden name=ct value="0">
<input type=hidden name=ie value="gb2312">
<input type=hidden name=bs value="jrckkyy">
<input type=hidden name=sr>
<input type=hidden name=z value="">
<input type=hidden name=cl value=3>
<input type=hidden name=f value=8>
<input name=wd size="35" class=i value="jrckkyy" maxlength=100>
<input type=submit value=癑ֺ一?gt; <input type=button value=l果中找 onclick="return bq(f2,1,0);"> </td>
<td nowrap><a href="</tr>
</form>
天网 http://www.tianwang.com/：
<form name=f action="/cgi-bin/tw" method=get>
<td valign=center width=634 background=images/index_image_02.gif>
<table height=46 cellspacing=0 cellpadding=0 width=600 align=right border=0>
<tbody>
<tr>
<td height=50>
<table cellspacing=0 cellpadding=0 width=600 border=0>
<tbody>
<tr>
<td width="524" height="30" valign="bottom">
<div align="center"> <input name="word" type="text" size="40" maxlength="255" onClick="this.focus();checkWord(this,1)" onblutesr='checkWord(this,0)' value='误入资源名U?>
<font color=#ffffff>
<select onChange=reRange(this.selectedIndex) name=range>
<script language=javascript>...
<!--
for(var i = 0; i < rescode.length; i++) ...{
if(i == 0) ...{
document.write('<option value="0" selected>' + rescode[i][0] + '</option>');
} else ...{
document.write('<option value="' + i + '">' + rescode[i][0] + '</option>');
}
}
document.f.range.selectedIndex = 0;
-->
</script>
</select>
</font>-<font color=#ffffff>
<select name=cd>
<script language=javascript>...
<!--
var ind = document.f.range.selectedIndex;
var len = (rescode[ind].length - 1) / 2;
var sel = 0;
for(var i = 0; i < len; i++) ...{
document.write('<option value="' + rescode[ind][2*i+1] + '">' + rescode[ind][2*i+2] + '</option>');
if(rescode[ind][2*i+1] == 0)
sel = i;
}
document.f.cd.selectedIndex = sel;
-->
</script>
</select>
</font></div>
</td>
<td width="71" valign="bottom"><input id=submit2 type=image height=22 width=40 src="images/so2.gif" align=absMiddle name=submit></td>
</tr>
<tr>
<td colspan=3 height=25 class=style16>
<div align=center></div>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
</td>
</form>
测试服务器TSE：
<form method="get" action="/cgi-bin/index/TSESearch" name="tw">
<td width="100%" height="25" align="center">
<input type="text" name="word" size="55">
<input type="submit" value=" 搜烦(ch)" name="www">
</td>
<input type="hidden" name="cdtype" value="GB">
</form>
由以上几个form的属性可以看出全部采用的是get方法，CGI做为处理程序，也就是C/C++。CGI全称是“公共网关接口”(Common Gateway Interface)，是HTTP服务器与你的或其它机器上的程序进行“交谈”的一种工具，其程序须运行在网络服务器上。CGI逐渐被近几年来的PHP、JAVA、ASP、PERL、Python、Ruby等动态语言所取代，但是其在速度和运行效率上的优势是无法取代的。
以下是TSE CGI入口程序注释，其他搜索引擎的入口也应该类似：
/**
 * Program annotation notes
 * @Copyright (c) 2008, R&D department
 * All rights reserved.
 *
 * @filesource TSESearch.cpp
 * @author jrckkyy <jrckkyy@163.com>
 *
 * Let's start
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <unistd.h>

#include <iostream>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "Comm.h"        // names of the two index files and the one data file
#include "Query.h"       // query processing
#include "Document.h"    // html document handling
#include "StrFun.h"      // string helpers
#include "ChSeg/Dict.h"  // segmentation dictionary
#include "ChSeg/HzSeg.h"
#include "DisplayRst.h"  // result page output: top, middle and bottom parts
using namespace std;
/*
 * An inverted file (INF) includes a term-index file & an inverted-lists file.
 * An inverted-lists file consists of many buckets (posting lists).
 * The term-index file is stored at vecTerm, and
 * the inverted-lists is stored at mapBuckets.
 */
/**//**
* E序译说明
* 搜烦(ch)E序入口前台关键字提交到该cgiE序 例如Q?/cgi-bin/index/TSESearch?word=123&start=1
* 倒排文g包括一个记录检索词文g和一个倒排列表文g?br> * 倒排列表包含很多标志Q提交名单)?br> * 记录(g)索词文g使用vecTerm来排序,和倒排列表是用mapBuckets来排序?br> *
* @access public
* @param int char 参数的汉字说?用于接收前台get传递的参数
* @return string 0
*/
int main(int argc, char* argv[])
...{
struct timeval begin_tv, end_tv;
struct timezone tz;
CDict iDict;
map<string, string> dictMap, mapBuckets;
vector<DocIdx> vecDocIdx; //Document。h
CQuery iQuery;
iQuery.GetInputs(); //具体E序开始执?br> // current query & result page number
iQuery.SetQuery();
iQuery.SetStart();
// begin to search
//开始具体搜索程?br> gettimeofday(&begin_tv,&tz); //开始计时获取程序运行时间差
iQuery.GetInvLists(mapBuckets); //所有字W集存入映射变量?nbsp; 瓉所?br> iQuery.GetDocIdx(vecDocIdx); //倒排索引存入向量?nbsp; 瓉所?br>
CHzSeg iHzSeg; //include ChSeg/HzSeg.h
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //get到的查询变量分词分成 "? ? 你们/ ? 格式"
vector<string> vecTerm;
iQuery.ParseQuery(vecTerm); //以"/"划分开的关键字一一序攑օ一个向量容器中
set<string> setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
gettimeofday(&end_tv,&tz);
// search end
//搜烦(ch)完毕
//下面开始显C?br> CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);
iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);
return 0;
}