Sending alert e-mail from Python
http://www.shnenglu.com/jrckkyy/archive/2010/03/15/109755.html
Author: 学者站在巨人的肩膀上 — Mon, 15 Mar 2010 11:24:00 GMT

A while ago I spent some time learning Python, and recently I finished a project that monitors basic server metrics — a distributed system that logs the data and raises alarms. Each server is sampled once per minute, about 1,440 records per server per day; with the few dozen servers currently monitored that comes to tens of thousands of log records a day, and the single monitoring-center server still has performance headroom to spare. I would need more servers to test against, but I estimate it can scale to monitoring 100+ servers.

Now a new requirement has come up: when an alarm fires, notify the people concerned in real time. Since the company's SMS gateway cannot reach Shanghai Telecom numbers (sigh), I had to fall back on sending e-mail instead.

The script supports sending GB18030-encoded text content and attachments of any encoding; with small modifications it can be adapted to send other kinds of payloads.

 

#!/usr/lib/python2.5/bin/python
#coding=utf-8
import os
import sys
import time
from smtplib import SMTP
from email.MIMEMultipart import MIMEMultipart
from email.mime.application import MIMEApplication
from email.MIMEText import MIMEText

SMTP_SERVER = "mail.×××.com"
SMTP_PORT = "25"
USERNAME = "×××@×××.com"
USERPASSWORD = "×××"
FROM = "MonitorCenterWarning@×××.com"
TO = "×××@gmail.com"

def sendFileByMail(config):
    print 'Preparing...'
    message = MIMEMultipart()
    message['from'] = config['from']
    message['to'] = config['to']
    message['Reply-To'] = config['from']
    message['Subject'] = config['subject']
    message['Date'] = time.ctime(time.time())
    message['X-Priority'] = '3'
    message['X-MSMail-Priority'] = 'Normal'
    message['X-Mailer'] = 'Microsoft Outlook Express 6.00.2900.2180'
    message['X-MimeOLE'] = 'Produced By Microsoft MimeOLE V6.00.2900.2180'

    if 'file' in config:
        # attach the file
        f = open(config['file'], 'rb')
        attachment = MIMEApplication(f.read())
        f.close()
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=os.path.basename(config['file']))
        message.attach(attachment)

    if 'content' in config:
        # attach the text body
        f = open(config['content'], 'rb')
        content = f.read()
        f.close()
        body = MIMEText(content, 'base64', 'gb2312')
        message.attach(body)

    print 'OK'
    print 'Logging in...'
    smtp = SMTP(config['server'], config['port'])
    # comment out the next line if your SMTP server sends without login
    smtp.login(config['username'], config['password'])
    print 'OK'

    print 'Sending...',
    smtp.sendmail(config['from'], [config['from'], config['to']],
                  message.as_string())
    print 'OK'
    smtp.close()
    time.sleep(1)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print 'Usage: python %s contentfilename' % os.path.basename(sys.argv[0])
        print 'OR Usage: python %s contentfilename attachfilename' % os.path.basename(sys.argv[0])
        wait = raw_input("quit.")
        sys.exit(-1)
    elif len(sys.argv) == 2:
        sendFileByMail({
            'from': FROM,
            'to': TO,
            'subject': '[MonitorCenter]Send Msg %s' % sys.argv[1],
            'content': sys.argv[1],
            'server': SMTP_SERVER,
            'port': SMTP_PORT,
            'username': USERNAME,
            'password': USERPASSWORD})
    elif len(sys.argv) == 3:
        sendFileByMail({
            'from': FROM,
            'to': TO,
            'subject': '[MonitorCenter]Send Msg and File %s %s' % (sys.argv[1], sys.argv[2]),
            'content': sys.argv[1],
            'file': sys.argv[2],
            'server': SMTP_SERVER,
            'port': SMTP_PORT,
            'username': USERNAME,
            'password': USERPASSWORD})
    wait = raw_input("end.")

 

On Windows XP:

[screenshot: example run]

On Linux (Ubuntu, SUSE):

[screenshot: example run]

The received result:

[screenshot: the mail as received]



[Distributed cross-platform monitoring system] Monitoring network traffic and speed on Linux with a Python script
http://www.shnenglu.com/jrckkyy/archive/2010/03/15/109754.html
Author: 学者站在巨人的肩膀上 — Mon, 15 Mar 2010 11:22:00 GMT

Our SSE/SZSE level-1/level-2 market-data servers see heavy traffic during trading hours — roughly four hours a day, from 9:30 to 11:30 in the morning and 13:00 to 15:00 in the afternoon — so network speed is an important metric for the monitored servers. Over a sampling interval you can sum the upstream and downstream byte counts of each NIC and divide by the interval length to get the average speed for that period. My collector currently samples once a minute, and during actual trading hours the readings it produces are fairly accurate, holding at around 5 MB/s. Once, outside service hours, I saw a server's internal NIC running at several MB/s — sure enough, someone was doing a big manual file transfer.

The standalone monitoring script returns a list of tuples; the results are later aggregated into a complete XML data island. For easier debugging, every intermediate result of the script is also dumped to a temporary text file.

To run the script below, make sure ethtool is installed on your Linux box. It has been tested on Ubuntu (2.6.27-7-server) and SUSE (2.6.27.19-5-default) kernels.

Code:

 

#!/usr/bin/python
#coding=utf-8
import re
import os
import time

import utils

def sortedDictValues3(adict):
    keys = adict.keys()
    keys.sort()
    return map(adict.get, keys)

def run():
    if utils.isLinux() == False:
        return [('ifconfig_collect os type error', 'this is windows')]
    # not the first run: a snapshot from the previous run exists
    if os.path.isfile('./oldifconfig'):
        fileold = open('./oldifconfig', 'r')
        fileold.seek(0)
        # read back the timestamp and the ifconfig output recorded last time
        (oldtime, fileoldcontent) = fileold.read().split('#')
        fileold.close()
        netcard = {}
        key = ''
        for strline in fileoldcontent.split('\n'):
            if re.search('^lo', strline):
                break
            if re.search('^eth', strline):
                key = strline.split()[0]
                netcard[key] = ''
            netcard[key] = netcard[key] + strline + '\n'
        RXold = {}
        TXold = {}
        for key, value in netcard.items():
            tempsplit = value.split('\n')
            netcard[key] = ''
            for item in tempsplit:
                item = item + '<br>'
                netcard[key] = netcard[key] + item
                tempcount = 1
                for match in re.finditer("(bytes:)(.*?)( \()", item):
                    if tempcount == 1:
                        RXold[key] = match.group(2)
                        tempcount = tempcount + 1
                    elif tempcount == 2:
                        TXold[key] = match.group(2)
                        netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'

        # record the current ifconfig output into the snapshot file
        os.system('ifconfig > ifconfigtemp')
        file = open('./ifconfigtemp', 'r')
        fileold = open('./oldifconfig', 'w')
        fileold.write(str(int(time.time())))
        fileold.write('#')
        file.seek(0)
        fileold.write(file.read())
        fileold.close()
        netcard = {}
        file.seek(0)
        key = ''
        for strline in file.readlines():
            if re.search('^lo', strline):
                break
            if re.search('^eth', strline):
                key = strline.split()[0]
                netcard[key] = ''
            netcard[key] = netcard[key] + strline
        newnetcard = {}
        file.seek(0)
        key = ''
        for strline in file.readlines():
            if re.search('^lo', strline):
                break
            if re.search('^eth', strline):
                templist = strline.split()
                key = templist[0]
                newnetcard[key] = templist[4] + ' '
            if re.search('^ *inet ', strline):
                templist = strline.split()
                newnetcard[key] = templist[1][5:] + ' ' + newnetcard[key] + ' '
        for key, value in newnetcard.items():
            # record whether each NIC has a link, via ethtool
            os.system('ethtool %s > ethtooltemp' % (key))
            file = open('./ethtooltemp', 'r')
            tempethtooltemplist = file.read().split('\n\t')
            file.close()
            if re.search('yes', tempethtooltemplist[-1]):
                templist = newnetcard[key].split()
                newnetcard[key] = templist[0] + ' running! ' + templist[1]
            else:
                templist = newnetcard[key].split()
                if len(templist) > 1:
                    newnetcard[key] = templist[0] + ' stop! ' + templist[1]
                else:
                    newnetcard[key] = 'stop! ' + templist[0]
        RX = {}
        TX = {}
        for key, value in netcard.items():
            tempsplit = value.split('\n')
            netcard[key] = ''
            for item in tempsplit:
                item = item + '<br>'
                netcard[key] = netcard[key] + item
                tempcount = 1
                for match in re.finditer("(bytes:)(.*?)( \()", item):
                    if tempcount == 1:
                        RX[key] = str(int(match.group(2)) - int(RXold[key]))
                        tempcount = tempcount + 1
                    elif tempcount == 2:
                        TX[key] = str(int(match.group(2)) - int(TXold[key]))
                        divtime = float(int(time.time()) - int(oldtime))
                        if divtime == 0:
                            rate = float(TX[key]) + float(RX[key])
                        else:
                            rate = (float(TX[key]) + float(RX[key])) / divtime
                        if rate == 0:
                            newnetcard[key] = '0' + ' ' + newnetcard[key]
                        else:
                            newnetcard[key] = '%.2f' % rate + ' ' + newnetcard[key]
        return zip(['order'], ['48']) + newnetcard.items()
    else:
        # first run: just take a snapshot; all rates are reported as 0
        os.system('ifconfig > ifconfigtemp')
        file = open('./ifconfigtemp', 'r')
        fileold = open('./oldifconfig', 'w')
        fileold.write(str(int(time.time())))
        fileold.write('#')
        file.seek(0)
        fileold.write(file.read())
        fileold.close()

        netcard = {}
        file.seek(0)
        key = ''
        for strline in file.readlines():
            if re.search('^lo', strline):
                break
            if re.search('^eth', strline):
                key = strline.split()[0]
                netcard[key] = ''
            netcard[key] = netcard[key] + strline
        RX = {}
        TX = {}

        key = ''
        newnetcard = {}
        file.seek(0)
        for strline in file.readlines():
            if re.search('^lo', strline):
                break
            if re.search('^eth', strline):
                templist = strline.split()
                key = templist[0]
                newnetcard[key] = templist[4] + ' '
            if re.search('^ *inet ', strline):
                templist = strline.split()
                newnetcard[key] = newnetcard[key] + templist[1][5:] + ' '
        for key, value in newnetcard.items():
            os.system('ethtool %s > ethtooltemp' % (key))
            file = open('./ethtooltemp', 'r')
            tempethtooltemplist = file.read().split('\n')
            file.close()
            if re.search('yes', tempethtooltemplist[-1]):
                newnetcard[key] = newnetcard[key] + 'running!'
            else:
                newnetcard[key] = newnetcard[key] + 'stop!'
        for key, value in netcard.items():
            tempsplit = value.split('\n')
            netcard[key] = ''
            for item in tempsplit:
                item = item + '<br>'
                netcard[key] = netcard[key] + item
                tempcount = 1
                for match in re.finditer("(bytes:)(.*?)( \()", item):
                    if tempcount == 1:
                        RX[key] = match.group(2)
                        tempcount = tempcount + 1
                    elif tempcount == 2:
                        TX[key] = match.group(2)
                        netcard[key] = netcard[key] + 'net io percent(bytes/s): 0 <br>'
                        newnetcard[key] = newnetcard[key] + ' ' + '0 <br>'
        return zip(['order'], ['48']) + newnetcard.items()

if __name__ == '__main__':
    print run()

 

Usage example:

[screenshot: sample output of the script]

In each list element (a tuple), the first field of the second item is the network speed in bytes/s — for example, eth1 is running at about 1.3 KB/s and eth0 at about 2.9 KB/s. Today is Saturday, so traffic this low is normal.

Top-down into search engines — PKU Tianwang (TSE) analyzed and fully annotated [6]: building the inverted index, program analysis (4)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102949.html
Author: 学者站在巨人的肩膀上 — Thu, 10 Dec 2009 15:03:00 GMT

Below is the annotated code that builds the inverted index from the forward index.

 

// CrtInvertedIdx.cpp — usage: ./CrtInvertedIdx moon.fidx.sort > sun.iidx
#include <iostream>
#include <fstream>
#include <string>

using namespace std;

int main(int argc, char* argv[])
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo) 
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine, strDocNum, tmp1 = "";
    int cnt = 0;
    while (getline(ifsImgInfo, strLine)) 
    {
        string::size_type idx;
        string tmp;

        idx = strLine.find("\t");
        tmp = strLine.substr(0, idx);      // the term of this "term\tdocid" line

        if (tmp.size() < 2 || tmp.size() > 8) continue;

        if (tmp1.empty()) tmp1 = tmp;

        if (tmp == tmp1)   // same term as the previous line: append this docid
        {
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }
        else               // new term: flush the previous term's posting list
        {
            if ( strDocNum.empty() )
                strDocNum = strDocNum + " " + strLine.substr(idx+1);

            cout << tmp1 << "\t" << strDocNum << endl;
            tmp1 = tmp;
            strDocNum.clear();
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }

        cnt++;
        //if (cnt==100) break;
    }
    // in the inverted index, each dictionary term is separated from its
    // list of document numbers by a tab
    cout << tmp1 << "\t" << strDocNum << endl;

    return 0;
}

 

 



Top-down into search engines — PKU Tianwang (TSE) analyzed and fully annotated [6]: building the inverted index, program analysis (3)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102948.html
Author: 学者站在巨人的肩膀上 — Thu, 10 Dec 2009 15:02:00 GMT

This part covers building the forward index. Building the inverted index directly would be quite inefficient, so we first produce a forward index as the foundation for the inverted index that follows.

 

Detailed descriptions of the files involved were already given in part [5], "Building the inverted index and the files involved".

 

The CrtForwardIdx.cpp file:

 

// CrtForwardIdx.cpp — usage: ./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx
#include <iostream>
#include <fstream>
#include <string>

using namespace std;

// SEPARATOR comes from TSE's common header; judging from the .seg samples
// it is the "/  " sequence the segmenter emits between terms (assumed here)
const string SEPARATOR = "/  ";

int main(int argc, char* argv[])
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo) 
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine, strDocNum;
    int cnt = 0;
    while (getline(ifsImgInfo, strLine)) 
    {
        string::size_type idx;

        cnt++;
        if (cnt%2 == 1)   // odd lines carry the document number
        {
            strDocNum = strLine.substr(0, strLine.size());
            continue;
        }
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            continue;
        }

        while ( (idx = strLine.find(SEPARATOR)) != string::npos )  // split on the term separator
        {
            string tmp1 = strLine.substr(0, idx);
            cout << tmp1 << "\t" << strDocNum << endl;
            strLine = strLine.substr(idx + SEPARATOR.size());
        }

        //if (cnt==100) break;
    }

    return 0;
}

 

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

 

 



Top-down into search engines — PKU Tianwang (TSE) analyzed and fully annotated [6]: building the inverted index, program analysis (2)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102947.html
Author: 学者站在巨人的肩膀上 — Thu, 10 Dec 2009 15:02:00 GMT

The DocIndex program described earlier takes a Tianwang.raw.***** file as input and produces three files: Doc.idx, Url.idx and DocId2Url.idx. Here we analyze the DocSegment program.

DocSegment takes three inputs — Tianwang.raw.*****, Doc.idx and Url.idx.sort_uniq — and outputs Tianwang.raw.***.seg, the fully segmented document file.

// DocSegment.cpp — usage: ./DocSegment Tianwang.raw.****
// (TSE's own headers — Url.h, Document.h, HzSeg.h, StrFun.h, etc. — are
// assumed to be included above, as in the original source.)
int main(int argc, char* argv[])
{
    string strLine, strFileName = argv[1];
    CUrl iUrl;
    vector<CUrl> vecCUrl;
    CDocument iDocument;
    vector<CDocument> vecCDocument;
    unsigned int docId = 0;

    //ifstream ifs("Tianwang.raw.2559638448");
    ifstream ifs(strFileName.c_str());
    if (!ifs) 
    {
        cerr << "Cannot open " << strFileName << " for input\n";
        return -1;
    }

    ifstream ifsUrl("Url.idx.sort_uniq");   // the sorted, de-duplicated url dictionary
    if (!ifsUrl) 
    {
        cerr << "Cannot open Url.idx.sort_uniq for input\n";
        return -1;
    }
    ifstream ifsDoc("Doc.idx");             // the document dictionary file
    if (!ifsDoc) 
    {
        cerr << "Cannot open Doc.idx for input\n";
        return -1;
    }

    while (getline(ifsUrl, strLine))   // load the url dictionary into a vector in memory
    {
        char chksum[33];
        int  docid;

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%s%d", chksum, &docid );
        iUrl.m_sChecksum = chksum;
        iUrl.m_nDocId = docid;
        vecCUrl.push_back(iUrl);
    }

    while (getline(ifsDoc, strLine))   // load the document dictionary into a vector in memory
    {
        int docid, pos, length;
        char chksum[33];

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length, chksum );
        iDocument.m_nDocId = docid;
        iDocument.m_nPos = pos;
        iDocument.m_nLength = length;
        iDocument.m_sChecksum = chksum;
        vecCDocument.push_back(iDocument);
    }

    strFileName += ".seg";
    ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);  // output file for the segmented data
    for ( docId=0; docId<MAX_DOC_ID; docId++ )
    {
        // find the document's extent in the big file according to docId
        int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos - 1;
        char *pContent = new char[length+1];
        memset(pContent, 0, length+1);
        ifs.seekg(vecCDocument[docId].m_nPos);
        ifs.read(pContent, length);

        char *s;
        s = pContent;

        // skip the crawler's record head (it ends at a blank line)
        int bytesRead = 0, newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        // skip the HTTP header (it also ends at a blank line)
        bytesRead = 0, newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        //iDocument.m_sBody = s;
        iDocument.RemoveTags(s);    // strip <...> markup
        iDocument.m_sBodyNoTags = s;

        delete[] pContent;
        string strLine = iDocument.m_sBodyNoTags;

        CStrFun::ReplaceStr(strLine, "&nbsp;", " ");
        CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "

        // segment the document — the actual word segmentation
        CHzSeg iHzSeg;
        strLine = iHzSeg.SegmentSentenceMM(iDict, strLine);
        fout << docId << endl << strLine;
        fout << endl;
    }

    return(0);
}
This was only a quick skim over the code; later I will write dedicated posts covering the details of parsing HTML and segmenting documents.

 

 



Top-down into search engines — PKU Tianwang (TSE) analyzed and fully annotated [6]: building the inverted index, program analysis (1)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102945.html
Author: 学者站在巨人的肩膀上 — Thu, 10 Dec 2009 15:00:00 GMT


The previous article introduced the files and intermediate files involved in building the inverted index.
At the program level, TSE's index building can be simplified into the following steps:

1. Run ./DocIndex
Uses one file: tianwang.raw.520 — the raw crawled file. It holds the complete contents of many web pages, so it is very large, and that raises an unresolved question: store one big file (beyond 2 GB/4 GB you hit file-size limits, and indexing such a file is inefficient), or many small files (too many files and the cost of opening and closing file handles dominates)? Storage will ultimately have to be distributed — the total volume will certainly reach the TB range — but TSE only targets small search-engine workloads.
Produces three files: Doc.idx, Url.idx, DocId2Url.idx (Doc.idx, DocId2Url.idx and Url.idx in the Data folder).

2. Run sort Url.idx|uniq > Url.idx.sort_uniq    (Url.idx.sort_uniq in the Data folder)
Uses one file: Url.idx — pairs of md5-hashed full URLs and document ids.
Produces one file: Url.idx.sort_uniq — the URLs de-duplicated and sorted by md5 hash, which speeds up lookup.

3. Run ./DocSegment Tianwang.raw.2559638448
Uses one file: Tianwang.raw.2559638448 — the crawled file, each page including its HTTP header; segmentation prepares for building the inverted index later.
Produces one file: Tianwang.raw.2559638448.seg — the segmented file, consisting of one line with the document id followed by one line with the document's segmented terms (only the text inside markers such as <html></html>, <head></head> and <body></body> is segmented).

4. Run ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx    — build the standalone forward index.

5. Run:
#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6. Run ./CrtInvertedIdx moon.fidx.sort > sun.iidx    — build the inverted index.

We begin the analysis with the first indexing program, DocIndex.cpp. (Naming convention: Tianwang.raw.2559638448 is the big merged crawl file, called "the big file" below; it contains many HTML documents, regularly delimited from one another, each called "a document".)


//DocIndex.h start-------------------------------------------------------------

#ifndef _COMM_H_040708_
#define _COMM_H_040708_

// (the original include names were lost in the HTML conversion; these are
// the standard headers the code below needs)
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <map>
#include <algorithm>
#include <cstdio>

using namespace std;

const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned per page to the front end

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312;  // used in DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;


//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// e.g. 朱德    14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//      朱古力  1085 1222

// 90,000+ entries; the character file covers special symbols, punctuation and hanzi
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0";              // used in DocIndex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0";  // used in Snapshot.cpp


//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");


#endif // _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------

//DocIndex.cpp start-----------------------------------------------------------

#include <iostream>
#include <fstream>
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include <cstdio>

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
    //ifstream ifs("Tianwang.raw.3023555472");
    //iceway(mnsc)
    ifstream ifs(DOC_FILE_NAME.c_str()); // open the raw crawl file
    if (!ifs)
    {
        cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
        return -1;
    }
    ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open Url.idx
    if( !ofsUrl )
    {
        cout << "error open file " << endl;
    }

    ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open Doc.idx
    if( !ofsDoc )
    {
        cout << "error open file " << endl;
    }

    ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open DocId2Url.idx
    if( !ofsDocId2Url )
    {
        cout << "error open file " << endl;
    }

    int cnt = 0; // document ids count from 0
    string strLine, strPage;
    CUrl iUrl;
    CDocument iDocument;
    CMD5 iMD5;

    int nOffset = ifs.tellg();
    while (getline(ifs, strLine))
    {
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            nOffset = ifs.tellg();
            continue;
        }

        if (!strncmp(strLine.c_str(), "version: 1.0", 12)) // parse on only if the first line is "version: 1.0"
        {
            if(!getline(ifs, strLine)) break;
            if (!strncmp(strLine.c_str(), "url: ", 4)) // parse on only if the second line starts with "url: "
            {
                iUrl.m_sUrl = strLine.substr(5); // the url itself, after the "url: " prefix
                iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); // md5-hash the url
                iUrl.m_sChecksum = iMD5.ToString(); // join the char array into a string (implemented in Md5.h)
            } else
            {
                continue;
            }

            while (getline(ifs, strLine))
            {
                if (!strncmp(strLine.c_str(), "length: ", 8)) // keep reading until the "length: " line
                {
                    sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); // store the page's actual content length in iDocument
                    break;
                }
            }

            getline(ifs, strLine); // skip the deliberate blank line after the record head

            iDocument.m_nDocId = cnt;     // assign the document id
            iDocument.m_nPos = nOffset;   // this record's offset in the big file
            char *pContent = new char[iDocument.m_nLength+1]; // buffer of the document's length

            memset(pContent, 0, iDocument.m_nLength+1); // zero-initialize
            ifs.read(pContent, iDocument.m_nLength); // read the document body (HTTP header included), per the recorded length
            iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
            iDocument.m_sChecksum = iMD5.ToString(); // join the char array into a string (implemented in Md5.h)

            delete[] pContent;

            ofsUrl << iUrl.m_sChecksum;                      // md5 hash of the url, into Url.idx
            ofsUrl << "\t" << iDocument.m_nDocId << endl;    // tab, then the document id

            ofsDoc << iDocument.m_nDocId;                    // document id, into Doc.idx
            ofsDoc << "\t" << iDocument.m_nPos;              // tab, then this record's offset (also where the next record begins)
            //ofsDoc << "\t" << iDocument.m_nLength;
            ofsDoc << "\t" << iDocument.m_sChecksum << endl; // tab, then the document's md5 checksum

            ofsDocId2Url << iDocument.m_nDocId;              // document id, into DocId2Url.idx
            ofsDocId2Url << "\t" << iUrl.m_sUrl << endl;     // tab, then the document's full url

            cnt++; // this document is done; move on to the next id
        }

        nOffset = ifs.tellg();

    }

    // the last line holds only the final document count and the end offset
    ofsDoc << cnt;
    ofsDoc << "\t" << nOffset << endl;


    return(0);
}

//DocIndex.cpp end-----------------------------------------------------------


 

 



Top-down into search engines — PKU Tianwang (TSE) analyzed and fully annotated [5]: building the inverted index and the files involved
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102943.html
Author: 学者站在巨人的肩膀上 — Thu, 10 Dec 2009 14:55:00 GMT

Sorry to have kept everyone waiting — I was busy with exams for a while, but they are finally over. Without further ado, let's begin!

TSE loads all the crawled web documents into one big file and then builds a single unified index over the data in that file, which involves several steps.

1.  The document index (Doc.idx) keeps information about each document.  
 
It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.  
 
The information stored in each entry includes a pointer into the repository,  
 
a document length, a document checksum.  
 
 
 
//Doc.idx  文~号 文长度    checksum hash?nbsp; 
 
0   0   bc9ce846d7987c4534f53d423380ba70  
 
1   76760   4f47a3cad91f7d35f4bb6b2a638420e5  
 
2   141624  d019433008538f65329ae8e39b86026c  
 
3   142350  5705b8f58110f9ad61b1321c52605795  
 
//Doc.idx   end  
 
 
 
  The url index (url.idx) is used to convert URLs into docIDs.  
 
 
 
//url.idx  
 
5c36868a9c5117eadbda747cbdb0725f    0 
 
3272e136dd90263ee306a835c6c70d77    1 
 
6b8601bb3bb9ab80f868d549b5c5a5f3    2 
 
3f9eba99fa788954b5ff7f35a5db6e1f    3 
 
//url.idx   end  
 
 
 
It is a list of URL checksums with their corresponding docIDs and is sorted by  
 
checksum. In order to find the docID of a particular URL, the URL's checksum  
 
is computed and a binary search is performed on the checksums file to find its  
 
docID.  
 
 
 
    ./DocIndex  
 
        got Doc.idx, Url.idx, DocId2Url.idx //Data文g夹中的Doc.idx DocId2Url.idx和Doc.idx?nbsp; 
 
 
 
//DocId2Url.idx  
 
0   http://*.*.edu.cn/index.aspx  
 
1   http://*.*.edu.cn/showcontent1.jsp?NewsID=118  
 
2   http://*.*.edu.cn/0102.html  
 
3   http://*.*.edu.cn/0103.html  
 
//DocId2Url.idx end  
 
 
 
2.  sort Url.idx|uniq > Url.idx.sort_uniq    //Data文g夹中的Url.idx.sort_uniq  
 
 
 
//Url.idx.sort_uniq  
 
//对hashD行排?nbsp; 
 
000bfdfd8b2dedd926b58ba00d40986b    1111 
 
000c7e34b653b5135a2361c6818e48dc    1831 
 
0019d12f438eec910a06a606f570fde8    366 
 
0033f7c005ec776f67f496cd8bc4ae0d    2103 
 
 
 
3. Segment document to terms, (with finding document according to the url)  
 
    ./DocSegment Tianwang.raw.2559638448        //Tianwang.raw.2559638448为爬回来的文?Q每个页面包含http?nbsp; 
 
        got Tianwang.raw.2559638448.seg       
 
 
 
//Tianwang.raw.2559638448   爬取的原始网|件在文内部每一个文之间应该是通过versionQ?lt;/html>和回车做标志位分割的  
 
version: 1.0 
 
url: http://***.105.138.175/Default2.asp?lang=gb  
 
origin: http://***.105.138.175/  
 
date: Fri, 23 May 2008 20:01:36 GMT  
 
ip: 162.105.138.175 
 
length: 38413 
 
 
 
HTTP/1.1 200 OK  
 
Server: Microsoft-IIS/5.0 
 
Date: Fri, 23 May 2008 11:17:49 GMT  
 
Connection: keep-alive  
 
Connection: Keep-Alive  
 
Content-Length: 38088 
 
Content-Type: text/html; Charset=gb2312  
 
Expires: Fri, 23 May 2008 11:17:49 GMT  
 
Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/  
 
Cache-control: private 
 
 
 
 
 
 
 
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">

<html>  
 
<head>  
 
<title>Apabi数字资源平台</title>  
 
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">  
 
<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">  
 
<META NAME="DESCRIPTION" CONTENT="数字图书馆 方正数字图书馆 电子图书 电子书 ebook e书 Apabi 数字资源平台">  
 
<link rel="stylesheet" type="text/css" href="css\common.css">  
 
 
 
<style type="text/css">  
 
<!--  
 
.style4 {color: #666666}  
 
-->  
 
</style>  
 
 
 
<script LANGUAGE="vbscript">  
 
...  
 
</script>  
 
 
 
<Script Language="javascript">  
 
...  
 
</Script>  
 
</head>  
 
<body leftmargin="0" topmargin="0">  
 
</body>  
 
</html>  
 
//Tianwang.raw.2559638448   end  
 
 
 
//Tianwang.raw.2559638448.seg   each page is segmented onto one line, as below (note: no carriage return is used as a separator in between)

1

...

...

...

2

...

...

...

//Tianwang.raw.2559638448.seg   end  
 
 
 
// The steps below are optional for Tiny search

4. Create forward index (docid-->termid)     // build the forward index

    ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx  
 
 
 
//Tianwang.raw.2559638448.seg   each page becomes a DocID line followed by its segmented terms, e.g.:
1
三星/  s/  手机/  论坛/  ,/  手机/  铃声/  下载/  ,/  手机/  图片/  下载/  ,/  手机/
2
...
...
...
//Tianwang.raw.2559638448.seg   end

 
 
//moon.fidx  

// one "term  DocID" pair per term segmented out of each document  

都会  2391 
?   2391 
那些  2391 
拥有  2391 
?   2391 
?   2391 
?   2391 
?   2391 
视野  2391 
?   2391 
?   2391 
?   2180 
研究生部    2180 
主页  2180 
培养  2180 
管理  2180 
栏目  2180 
下载  2180 
?   2180 
?   2180 
关于  2180 
做好  2180 
?   2180 
国家  2180 
公派  2180 
研究生  2180 
项目  2180 

//moon.fidx end  
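Step 4 boils down to replaying the .seg file as (term, docID) pairs. A minimal sketch of what CrtForwardIdx does (an illustrative reimplementation with a hypothetical function name, assuming the .seg layout above: a docID line followed by one line of "/  "-separated terms):

```cpp
#include <cassert>
#include <iostream>
#include <sstream>
#include <string>

// Emit one "term<TAB>docid" line per segmented term (moon.fidx-style).
void EmitForwardIndex(std::istream& in, std::ostream& out) {
    std::string docid, terms;
    // Each record: a docID line, then a line of "/  "-separated terms.
    while (std::getline(in, docid) && std::getline(in, terms)) {
        std::string::size_type pos;
        while ((pos = terms.find("/  ")) != std::string::npos) {
            std::string term = terms.substr(0, pos);
            if (!term.empty())
                out << term << '\t' << docid << '\n';
            terms = terms.substr(pos + 3);  // skip the 3-char separator
        }
    }
}
```

Sorting this output (step 5) groups all postings of a term together, which is what makes the inverted-index merge in step 6 a single pass.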
 
 
 
5. Sort the forward index. Export LANG=en first so that sort uses plain byte-wise collation:

# set | grep "LANG"

LANG=en; export LANG;  

sort moon.fidx > moon.fidx.sort  
 
 
 
6. Create inverted index (termid-->docid)    // build the inverted index
 
    ./CrtInvertedIdx moon.fidx.sort > sun.iidx  
 
 
 
//sun.iidx  // roughly half the size of the forward-index file

花工   236 
?   2103 
花卉   1018 1061 1061 1061 1730 1730 1730 1730 1730 1852 949 949 
?   447 447 
花木   1061 
花呢   1430 
花期   447 447 447 447 447 525 
花钱   174 236 
?   1730 1730 
?品种     1660 
?   450 526 
花式   1428 1430 1430 1430 
?   1430 1430 
花序   447 447 447 447 447 450 
?   136 137 
?   450 450 

//sun.iidx  end  
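Step 6 is essentially one merge pass over the sorted pairs. A sketch (illustrative reimplementation, hypothetical function name) that concatenates the docIDs of equal terms onto one posting line:

```cpp
#include <cassert>
#include <iostream>
#include <sstream>
#include <string>

// Merge sorted "term docid" lines (moon.fidx.sort-style) into one
// "term<TAB>docid docid ..." line per term (sun.iidx-style).
std::string BuildInvertedIndex(std::istream& in) {
    std::ostringstream out;
    std::string term, docid, curTerm, postings;
    while (in >> term >> docid) {
        if (term != curTerm) {             // a new term begins
            if (!curTerm.empty())
                out << curTerm << '\t' << postings << '\n';
            curTerm = term;
            postings.clear();
        }
        if (!postings.empty()) postings += ' ';
        postings += docid;                 // append this posting
    }
    if (!curTerm.empty())                  // flush the last term
        out << curTerm << '\t' << postings << '\n';
    return out.str();
}
```

Because the input is already sorted by term, no in-memory map is needed; the merge runs in a single streaming pass, which matters at crawl scale.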
 
 
 
TSESearch   CGI program for query  
 
Snapshot    CGI program for page snapshot  
 
 
author: http://hi.baidu.com/jrckkyy  
author: http://blog.csdn.net/jrckkyy  

 



Learning search engines top-down: analysis and full annotation of the PKU Tianwang search engine TSE [4]: Summary
(http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102942.html, Thu, 10 Dec 2009)

The previous three articles should have given you an intuitive feel for the once-mysterious search engine. Much like an ordinary PHP-style server-side script, it fetches the keywords from the front end, segments them against a dictionary, runs a relevance analysis against the prebuilt inverted index, and formats the matching results for output. The hard technical points are:

1. Dictionary selection (language habits differ across eras and regions, so the smallest units in the dictionary differ as well).

2. Building the inverted index (this involves the crawler's fetching and the construction of the index, both covered in depth later; a search engine's efficiency, service quality, and freshness bottlenecks live here).

3. Relevance analysis (the segmentation algorithm used when indexing fetched documents must correspond to the one applied to user keywords).

Later articles will focus on the crawler's fetching and on building the index.

Learning search engines top-down: analysis and full annotation of the PKU Tianwang search engine TSE [3]: Keyword segmentation and relevance analysis
(http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102941.html, Thu, 10 Dec 2009)

From the annotations so far we know that once the query keywords and the dictionary file are ready, the user-keyword segmentation stage begins.

//TSESearch.cpp:

 CHzSeg iHzSeg;      // include ChSeg/HzSeg.h

 // segment the query string fetched from the front end into the
 // form "term1/  term2/  term3/  "
 iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);

 vector<string> vecTerm;
 iQuery.ParseQuery(vecTerm);     // push the "/"-separated keywords, in order, into a vector

 set<string> setRelevantRst;
 iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

 gettimeofday(&end_tv, &tz);
 // search end

Now look at this method in CHzSeg:

//ChSeg/HzSeg.h
/**
 * Pre-processes a sentence before segmentation: cleans the data and
 * splits off ASCII runs and non-hanzi double-byte characters, handing
 * only hanzi runs to the dictionary segmenter.
 * @access  public
 * @param   dict the dictionary; s1 the query string
 * @return  the segmented string
 */
// process a sentence before segmentation
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
    string s2="";
    unsigned int i,len;

    while (!s1.empty())
    {
        unsigned char ch=(unsigned char) s1[0];
        if(ch<128)
        { // deal with ASCII
            i=1;
            len = s1.size();
            while (i<len && (unsigned char)s1[i]<128 && s1[i]!=13)  // ch==13: CR, added by yhf
                i++;

            if (ch!=32 && ch!=13)       // neither space nor CR
                s2 += s1.substr(0, i) + SEPARATOR;
            else if (ch==13)            // keep the carriage return itself
                s2 += s1.substr(0, i);

            if (i <= s1.size())  // yhf
                s1 = s1.substr(i);
            else break;          // yhf

            continue;
        }
        else if (ch<176)
        { // deal with Chinese punctuation and other non-hanzi double-byte characters
            i = 0;
            len = s1.length();

            while (i<len && (unsigned char)s1[i]<176 && (unsigned char)s1[i]>=161
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191))))
            {
                i=i+2; // assume there is no half hanzi
            }

            if (i==0) i=i+2;

            // do not output the full-width Chinese space
            if (!(ch==161 && (unsigned char)s1[1]==161))
            {
                if (i <= s1.size())  // yhf
                    // other non-hanzi double-byte characters may be output consecutively
                    s2 += s1.substr(0, i) + SEPARATOR;
                else break; // yhf
            }

            if (i <= s1.size())  // yhf
                s1 = s1.substr(i);
            else break;     // yhf

            continue;
        }

        // from here on, handle a run of hanzi

        i = 2;
        len = s1.length();

        while (i<len && (unsigned char)s1[i]>=176)
            i+=2;

        s2 += SegmentHzStrMM(dict, s1.substr(0,i));

        if (i <= len)    // yhf
            s1 = s1.substr(i);
        else break; // yhf
    }

    return s2;
}
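SegmentSentenceMM hands each hanzi run to SegmentHzStrMM (not shown), which does dictionary-based forward maximum matching. A minimal sketch of that idea, using a toy dictionary and a hypothetical SegmentMM helper rather than the TSE implementation:

```cpp
#include <algorithm>
#include <cassert>
#include <set>
#include <string>

// Forward maximum matching over a toy dictionary: at each position take
// the longest dictionary word (up to maxLen units), otherwise a single
// unit. Single chars serve as units here for clarity; TSE walks GB2312
// double-byte hanzi instead.
std::string SegmentMM(const std::set<std::string>& dict,
                      const std::string& s, std::size_t maxLen = 4) {
    std::string out;
    std::size_t pos = 0;
    while (pos < s.size()) {
        std::size_t take = 1;
        for (std::size_t len = std::min(maxLen, s.size() - pos); len > 1; --len)
            if (dict.count(s.substr(pos, len))) { take = len; break; }
        out += s.substr(pos, take) + "/  ";  // "/  " is TSE's SEPARATOR
        pos += take;
    }
    return out;
}
```

Forward maximum matching is greedy, which is why indexing and query-time segmentation must use the same dictionary and direction, as article [4] stresses.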
  

//Query.cpp
/**
 * Push the "/  "-separated keywords, in order, into a vector.
 *
 * @access  public
 * @param   vecTerm  the output vector of terms
 * @return  void
 */
void CQuery::ParseQuery(vector<string> &vecTerm)
{
    string::size_type idx;
    while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {
        vecTerm.push_back(m_sSegQuery.substr(0,idx));
        m_sSegQuery = m_sSegQuery.substr(idx+3);
    }
}
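As a standalone illustration of the same "/  " split (a hypothetical free function, not part of TSE):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Split a segmented query on the 3-character "/  " separator,
// mirroring what CQuery::ParseQuery does on m_sSegQuery.
std::vector<std::string> SplitSegQuery(std::string seg) {
    std::vector<std::string> terms;
    std::string::size_type idx;
    while ((idx = seg.find("/  ")) != std::string::npos) {
        terms.push_back(seg.substr(0, idx));  // text before "/  "
        seg = seg.substr(idx + 3);            // skip the separator
    }
    return terms;
}
```

Note that anything after the last separator is dropped, which is harmless here because SegmentSentenceMM always appends the separator after each term.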
/**
 * Relevance analysis: for each query term, count term frequency per
 * document, rank, and intersect with the running result set to build
 * setRelevantRst.   // this is the performance bottleneck
 *
 * @access  public
 * @param   vecTerm         the segmented user keywords
 * @param   mapBuckets      inverted-index map: term -> "docid docid ..."
 * @param   setRelevantRst  the relevant result set
 * @return  true on success
 */
bool CQuery::GetRelevantRst
(
    vector<string> &vecTerm,
    map<string, string> &mapBuckets,
    set<string> &setRelevantRst
) const
{
    set<string> setSRst;

    bool bFirst=true;
    vector<string>::iterator itTerm = vecTerm.begin();

    for ( ; itTerm != vecTerm.end(); ++itTerm )
    {
        setSRst.clear();
        copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

        map<string, int> mapRstDoc;
        string docid;
        int doccnt;

        map<string, string>::iterator itBuckets = mapBuckets.find(*itTerm);
        if (itBuckets != mapBuckets.end())
        {
            string strBucket = (*itBuckets).second;
            string::size_type idx;
            idx = strBucket.find_first_not_of(" ");
            strBucket = strBucket.substr(idx);

            while ( (idx = strBucket.find(" ")) != string::npos )
            {
                docid = strBucket.substr(0,idx);
                doccnt = 0;

                if (docid.empty()) {    // skip consecutive spaces, advancing past them
                    strBucket = strBucket.substr(idx+1);
                    continue;
                }

                map<string, int>::iterator it = mapRstDoc.find(docid);
                if ( it != mapRstDoc.end() )
                {
                    doccnt = (*it).second + 1;
                    mapRstDoc.erase(it);
                }
                mapRstDoc.insert( pair<string, int>(docid,doccnt) );

                strBucket = strBucket.substr(idx+1);
            }

            // remember the last one
            docid = strBucket;
            doccnt = 0;
            map<string, int>::iterator it = mapRstDoc.find(docid);
            if ( it != mapRstDoc.end() )
            {
                doccnt = (*it).second + 1;
                mapRstDoc.erase(it);
            }
            mapRstDoc.insert( pair<string, int>(docid,doccnt) );
        }

        // sort by term frequency (descending)
        multimap<int, string, greater<int> > newRstDoc;
        map<string, int>::iterator it0 = mapRstDoc.begin();
        for ( ; it0 != mapRstDoc.end(); ++it0 ){
            newRstDoc.insert( pair<int, string>((*it0).second,(*it0).first) );
        }

        multimap<int, string, greater<int> >::iterator itNewRstDoc = newRstDoc.begin();
        setRelevantRst.clear();
        for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
            string docid = (*itNewRstDoc).second;

            if (bFirst==true) {
                setRelevantRst.insert(docid);
                continue;
            }

            if ( setSRst.find(docid) != setSRst.end() ){
                setRelevantRst.insert(docid);
            }
        }

        //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<br>";
        bFirst = false;
    }
    return true;
}
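The core of GetRelevantRst is a per-term posting scan followed by an intersection with the running result set. Stripped of the frequency ranking, that intersection can be sketched as follows (a hypothetical helper, assuming the sun.iidx posting format term -> "docid docid ..."):

```cpp
#include <cassert>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

// Keep only the docIDs that occur in the postings of every query term,
// mimicking the setSRst/setRelevantRst intersection in GetRelevantRst.
std::set<std::string> IntersectPostings(
    const std::vector<std::string>& terms,
    const std::map<std::string, std::string>& mapBuckets) {
    std::set<std::string> result;
    bool first = true;
    for (const std::string& term : terms) {
        std::set<std::string> docs;
        auto it = mapBuckets.find(term);
        if (it != mapBuckets.end()) {
            std::istringstream posting(it->second);
            std::string docid;
            while (posting >> docid) docs.insert(docid);  // parse the posting list
        }
        if (first) { result = docs; first = false; }
        else {
            std::set<std::string> kept;   // AND-semantics: intersect with result
            for (const std::string& d : docs)
                if (result.count(d)) kept.insert(d);
            result = kept;
        }
    }
    return result;
}
```

This is AND semantics: a document must contain every query term to survive, which is why adding terms to a TSE query narrows the result set.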
What remains is just display. Everything up to here was data processing to obtain the result set setRelevantRst; from this point on it works much like a PHP-style scripting language: format the result set and print it out.


//TSESearch.cpp

// display the results
    CDisplayRst iDisplayRst;
    iDisplayRst.ShowTop();

    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;

    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
            setRelevantRst.size(), iQuery.m_iStart);

    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

 


