• <ins id="pjuwb"></ins>
    <blockquote id="pjuwb"><pre id="pjuwb"></pre></blockquote>
    <noscript id="pjuwb"></noscript>
          <sup id="pjuwb"><pre id="pjuwb"></pre></sup>
            <dd id="pjuwb"></dd>
            <abbr id="pjuwb"></abbr>

            學著站在巨人的肩膀上

            金融數學,InformationSearch,Compiler,OS,

              C++博客 :: 首頁(yè) :: 新隨筆 :: 聯(lián)系 :: 聚合  :: 管理 ::
              12 隨筆 :: 0 文章 :: 8 評(píng)論 :: 0 Trackbacks

            由前面的注釋我們可以知道,查詢關鍵字和字典文件準備好之後,將進入用戶關鍵字分詞階段。

            //TSESearch.cpp中:

            view plaincopy to clipboardprint?
            CHzSeg iHzSeg;      //include ChSeg/HzSeg.h  
             
            //  
            iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);  //將get到的查詢變量分詞分成 "我/        愛/      你們/ 的/      格式"  
             
            vector<STRING></STRING> vecTerm;  
            iQuery.ParseQuery(vecTerm);     //將以"/"劃分開的關(guān)鍵字一一順序放入一個(gè)向量容器中  
             
            set<STRING></STRING> setRelevantRst;   
            iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);   
             
            gettimeofday(&end_tv,&tz);  
            // search end  
            //搜索完畢 

             CHzSeg iHzSeg;  //include ChSeg/HzSeg.h

             //
             iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //將get到的查詢變量分詞分成 "我/  愛/  你們/ 的/  格式"
             
             vector vecTerm;
             iQuery.ParseQuery(vecTerm);  //將以"/"劃分開的關(guān)鍵字一一順序放入一個(gè)向量容器中
             
             set setRelevantRst;
             iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);
             
             gettimeofday(&end_tv,&tz);
             // search end
             //搜索完畢view plaincopy to clipboardprint?
            看CHzSeg 中的這個(gè)方法 

            看CHzSeg 中的這個(gè)方法view plaincopy to clipboardprint?
            //ChSeg/HzSeg.h 

            //ChSeg/HzSeg.hview plaincopy to clipboardprint?
            /**  
             * 程序翻譯說明  
             * 進(jìn)一步凈化數(shù)據(jù),轉(zhuǎn)換漢字  
             * @access  public  
             * @param   CDict, string 參數(shù)的漢字說明:字典,查詢字符串  
             * @return  string 0  
             */  
            // process a sentence before segmentation  
            //在分詞前處理句子  
            string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const  
            {  
                string s2="";  
                unsigned int i,len;  
             
                while (!s1.empty())   
                {  
                    unsigned char ch=(unsigned char) s1[0];  
                    if(ch<128)   
                    { // deal with ASCII  
                        i=1;  
                        len = s1.size();  
                        while (i<LEN len="s1.length();" i="0;" 中文標(biāo)點(diǎn)等非漢字字符="" if="" else="" yhf="" s1="s1.substr(i);" by="" added="" ch="=13)" s2="" cr=""></LEN>=161)  
                          && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))  
                          && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))  
                          && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)   
                          || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186  
                          || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))   
                            {   
                                ii=i+2; // 假定沒有半個(gè)漢字  
                            }  
             
                            if (i==0) ii=i+2;  
             
                            // 不處理中文空格  
                            if (!(ch==161 && (unsigned char)s1[1]==161))   
                            {   
                                if (i <= s1.size())  // yhf  
                                    // 其他的非漢字雙字節(jié)字符可能連續(xù)輸出  
                                    s2 += s1.substr(0, i) + SEPARATOR;   
                                else break; // yhf  
                            }  
             
                            if (i <= s1.size())  // yhf  
                                s1s1=s1.substr(i);  
                            else break;     //yhf  
             
                            continue;  
                        }  
                    }  
                  
             
                // 以下處理漢字串  
             
                    i = 2;  
                    len = s1.length();  
             
                    while(i<LEN></LEN>=176)   
            //    while(i<LEN></LEN>=128 && (unsigned char)s1[i]!=161)  
                        i+=2;  
             
                    s2+=SegmentHzStrMM(dict, s1.substr(0,i));  
             
                    if (i <= len)    // yhf  
                        s1s1=s1.substr(i);  
                    else break; // yhf  
                }  
             
                return s2;  

            /**
             * Pre-processes a sentence before segmentation.
             *
             * Consumes s1 from the front in a loop and builds the output string s2.
             * A leading byte below 128 is handled as ASCII; otherwise the code scans
             * two bytes at a time, appending non-Hanzi double-byte runs followed by
             * SEPARATOR and passing Hanzi runs to SegmentHzStrMM.
             * The byte thresholds used below (161, 163, 176) presumably refer to
             * GB2312/GBK lead bytes -- TODO confirm.
             *
             * NOTE(review): this listing appears damaged by HTML extraction. Both
             * inner `while` conditions are truncated (`while (i=161)`,
             * `while(i=176)`), and the ASCII branch runs straight into the
             * double-byte symbol handling. Recover the missing text from the
             * project's HzSeg source before compiling or relying on this copy.
             *
             * @param dict  dictionary forwarded to SegmentHzStrMM
             * @param s1    query string to process (taken by value and consumed)
             * @return      the accumulated output string s2
             */
            // process a sentence before segmentation
            string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
            {
             string s2="";
             unsigned int i,len;

             while (!s1.empty())
             {
              unsigned char ch=(unsigned char) s1[0];
              if(ch<128)
              { // deal with ASCII
               i=1;
               len = s1.size();
               // NOTE(review): condition truncated in this listing; text between
               // `i` and `=161)` is missing.
               while (i=161)
                          && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
                          && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
                          && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
                          || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
                          || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
                {
                 i=i+2; // assume there is never half of a double-byte character
                }

                if (i==0) i=i+2;

                // Do not emit the full-width (Chinese) space, i.e. the byte pair 161,161
                if (!(ch==161 && (unsigned char)s1[1]==161))
                {
                 if (i <= s1.size()) // yhf
                  // other non-Hanzi double-byte characters may be output as one run
                  s2 += s1.substr(0, i) + SEPARATOR;
                 else break; // yhf
                }

                // Drop the consumed prefix; stop if i ran past the end of s1
                if (i <= s1.size()) // yhf
                 s1=s1.substr(i);
                else break;  //yhf

                continue;
               }
              }
               

                // The code below handles runs of Hanzi

              i = 2;
              len = s1.length();

              // NOTE(review): condition truncated in this listing; text between
              // `i` and `=176)` is missing.
              while(i=176)
            //    while(i=128 && (unsigned char)s1[i]!=161)
               i+=2;

              // Segment the first i bytes with the dictionary and append the result
              s2+=SegmentHzStrMM(dict, s1.substr(0,i));

              if (i <= len) // yhf
               s1=s1.substr(i);
              else break; // yhf
             }

             return s2;
            }view plaincopy to clipboardprint?
              

             view plaincopy to clipboardprint?
            //Query.cpp 

            //Query.cppview plaincopy to clipboardprint?
            <PRE class=csharp name="code">/**  
             * 程序翻譯說明  
             * 將以"/"劃分開的關(guān)鍵字一一順序放入一個(gè)向量容器中  
             *  
             * @access  public  
             * @param   vector<STRING></STRING> 參數(shù)的漢字說明:向量容器  
             * @return  void  
             */  
            void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)  
            {  
                string::size_type idx;   
                while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {   
                    vecTerm.push_back(m_sSegQuery.substr(0,idx));   
                    m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);   
                }  
            }  
            </PRE> 
            <PRE class=csharp name="code"> </PRE> 
            <PRE class=csharp name="code"><PRE class=csharp name="code">/**  
             * 程序翻譯說明  
             * 相關(guān)性分析查詢,構(gòu)造結(jié)果集合setRelevantRst //瓶頸所在  
             *  
             * @access  public  
             * @param   vector<STRING></STRING> map set<STRING></STRING> 參數(shù)的漢字說明: 用戶提交關(guān)鍵字的分詞組,倒排索引映射,相關(guān)性結(jié)果集合  
             * @return  string 0  
             */  
            bool CQuery::GetRelevantRst  
            (  
                vector<STRING></STRING> &vecTerm,   
                map &mapBuckets,   
                set<STRING></STRING> &setRelevantRst  
            ) const  
            {  
                set<STRING></STRING> setSRst;  
             
                bool bFirst=true;  
                vector<STRING></STRING>::iterator itTerm = vecTerm.begin();  
             
                for ( ; itTerm != vecTerm.end(); ++itTerm )  
                {  
             
                    setSRst.clear();  
                    copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));  
             
                    map mapRstDoc;  
                    string docid;  
                    int doccnt;  
             
                    map::iterator itBuckets = mapBuckets.find(*itTerm);  
                    if (itBuckets != mapBuckets.end())  
                    {  
                        string strBucket = (*itBuckets).second;  
                        string::size_type idx;  
                        idx = strBucket.find_first_not_of(" ");  
                        strBucketstrBucket = strBucket.substr(idx);  
             
                        while ( (idx = strBucket.find(" ")) != string::npos )   
                        {  
                            docid = strBucket.substr(0,idx);  
                            doccnt = 0;  
             
                            if (docid.empty()) continue;  
             
                            map::iterator it = mapRstDoc.find(docid);  
                            if ( it != mapRstDoc.end() )  
                            {  
                                doccnt = (*it).second + 1;  
                                mapRstDoc.erase(it);  
                            }  
                            mapRstDoc.insert( pair(docid,doccnt) );  
             
                            strBucketstrBucket = strBucket.substr(idx+1);  
                        }  
             
                        // remember the last one  
                        docid = strBucket;  
                        doccnt = 0;  
                        map::iterator it = mapRstDoc.find(docid);  
                        if ( it != mapRstDoc.end() )  
                        {  
                            doccnt = (*it).second + 1;  
                            mapRstDoc.erase(it);  
                        }  
                        mapRstDoc.insert( pair(docid,doccnt) );  
                    }  
             
                    // sort by term frequencty  
                    multimap > newRstDoc;  
                    map::iterator it0 = mapRstDoc.begin();  
                    for ( ; it0 != mapRstDoc.end(); ++it0 ){  
                        newRstDoc.insert( pair((*it0).second,(*it0).first) );  
                    }  
             
                    multimap::iterator itNewRstDoc = newRstDoc.begin();  
                    setRelevantRst.clear();  
                    for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){  
                        string docid = (*itNewRstDoc).second;  
             
                        if (bFirst==true) {  
                            setRelevantRst.insert(docid);  
                            continue;  
                        }  
             
                        if ( setSRst.find(docid) != setSRst.end() ){      
                            setRelevantRst.insert(docid);  
                        }  
                    }  
             
                    //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";  
                    bFirst = false;  
                }  
                return true;  
            }</PRE> 
            </PRE> 
            接下來就是顯示了。前面都只是處理數據,得到 setRelevantRst 這個查詢結果集合,這裡就不多說了;下面就和 PHP 之類的腳本語言差不多:格式化結果集合并顯示出來。

            view plaincopy to clipboardprint?/**   * 程序翻譯說明   * 將以"/"劃分開的關(guān)鍵字一一順序放入一個(gè)向量容器中   *   * @access  public   * @param   vector<STRING></STRING> 參數(shù)的漢字說明:向量容器   * @return  void   */  void CQuery::ParseQuery(vector<STRING></STRING> &vecTerm)   {       string::size_type idx;        while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {            vecTerm.push_back(m_sSegQuery.substr(0,idx));            m_sSegQuery = m_sSegQuery.substr(idx+3);        }   }  /**
             * 程序翻譯說明
             * 將以"/"劃分開的關(guān)鍵字一一順序放入一個(gè)向量容器中
             *
             * @access  public
             * @param   vector 參數(shù)的漢字說明:向量容器
             * @return  void
             */
            void CQuery::ParseQuery(vector &vecTerm)
            {
             string::size_type idx;
             while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {
              vecTerm.push_back(m_sSegQuery.substr(0,idx));
              m_sSegQuery = m_sSegQuery.substr(idx+3);
             }
            }

            view plaincopy to clipboardprint?   
            view plaincopy to clipboardprint?<PRE class=csharp name="code">/**   * 程序翻譯說明   * 相關(guān)性分析查詢,構(gòu)造結(jié)果集合setRelevantRst //瓶頸所在   *   * @access  public   * @param   vector<STRING></STRING> map set<STRING></STRING> 參數(shù)的漢字說明: 用戶提交關(guān)鍵字的分詞組,倒排索引映射,相關(guān)性結(jié)果集合   * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const  {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   
doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }       return true;   }</PRE>  view plaincopy to clipboardprint?/**   * 程序翻譯說明   * 相關(guān)性分析查詢,構(gòu)造結(jié)果集合setRelevantRst //瓶頸所在   *   * @access  public   * @param   vector<STRING></STRING> map set<STRING></STRING> 參數(shù)的漢字說明: 用戶提交關(guān)鍵字的分詞組,倒排索引映射,相關(guān)性結(jié)果集合   * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector<STRING></STRING> &vecTerm,        map &mapBuckets,        set<STRING></STRING> &setRelevantRst   ) const  {       set<STRING></STRING> setSRst;         bool bFirst=true;       vector<STRING></STRING>::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string 
strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";           bFirst = false;       }   
    return true;   }  /**
             * 程序翻譯說明
             * 相關(guān)性分析查詢,構(gòu)造結(jié)果集合setRelevantRst //瓶頸所在
             *
             * @access  public
             * @param   vector map set 參數(shù)的漢字說明: 用戶提交關(guān)鍵字的分詞組,倒排索引映射,相關(guān)性結(jié)果集合
             * @return  string 0
             */
            bool CQuery::GetRelevantRst
            (
             vector &vecTerm,
             map &mapBuckets,
             set &setRelevantRst
            ) const
            {
             set setSRst;

             bool bFirst=true;
             vector::iterator itTerm = vecTerm.begin();

             for ( ; itTerm != vecTerm.end(); ++itTerm )
             {

              setSRst.clear();
              copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

              map mapRstDoc;
              string docid;
              int doccnt;

              map::iterator itBuckets = mapBuckets.find(*itTerm);
              if (itBuckets != mapBuckets.end())
              {
               string strBucket = (*itBuckets).second;
               string::size_type idx;
               idx = strBucket.find_first_not_of(" ");
               strBucket = strBucket.substr(idx);

               while ( (idx = strBucket.find(" ")) != string::npos )
               {
                docid = strBucket.substr(0,idx);
                doccnt = 0;

                if (docid.empty()) continue;

                map::iterator it = mapRstDoc.find(docid);
                if ( it != mapRstDoc.end() )
                {
                 doccnt = (*it).second + 1;
                 mapRstDoc.erase(it);
                }
                mapRstDoc.insert( pair(docid,doccnt) );

                strBucket = strBucket.substr(idx+1);
               }

               // remember the last one
               docid = strBucket;
               doccnt = 0;
               map::iterator it = mapRstDoc.find(docid);
               if ( it != mapRstDoc.end() )
               {
                doccnt = (*it).second + 1;
                mapRstDoc.erase(it);
               }
               mapRstDoc.insert( pair(docid,doccnt) );
              }

              // sort by term frequencty
              multimap > newRstDoc;
              map::iterator it0 = mapRstDoc.begin();
              for ( ; it0 != mapRstDoc.end(); ++it0 ){
               newRstDoc.insert( pair((*it0).second,(*it0).first) );
              }

              multimap::iterator itNewRstDoc = newRstDoc.begin();
              setRelevantRst.clear();
              for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
               string docid = (*itNewRstDoc).second;

               if (bFirst==true) {
                setRelevantRst.insert(docid);
                continue;
               }

               if ( setSRst.find(docid) != setSRst.end() ){ 
                setRelevantRst.insert(docid);
               }
              }

              //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
              bFirst = false;
             }
             return true;
            }

            接下來就是顯示了。前面都只是處理數據,得到 setRelevantRst 這個查詢結果集合,這裡就不多說了;下面就和 PHP 之類的腳本語言差不多:格式化結果集合并顯示出來。
            //TSESearch.cpp

            view plaincopy to clipboardprint?
            //下面開始顯示  
                CDisplayRst iDisplayRst;   
                iDisplayRst.ShowTop();   
             
                float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000   
                    +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;   
             
                iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,   
                        setRelevantRst.size(), iQuery.m_iStart);  
             
                iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

             

            posted on 2009-12-10 22:53 學(xué)者站在巨人的肩膀上 閱讀(993) 評(píng)論(0)  編輯 收藏 引用 所屬分類: 中文文本信息處理
            亚洲成色www久久网站夜月| 色偷偷888欧美精品久久久| 久久久久亚洲AV成人网人人网站| 怡红院日本一道日本久久 | 色偷偷888欧美精品久久久| 久久国产美女免费观看精品| 久久久久久久女国产乱让韩| 久久亚洲欧美日本精品| 久久精品国产亚洲AV久| 91精品观看91久久久久久| 久久精品国产男包| 国产成人综合久久精品尤物| 亚洲中文字幕无码一久久区| 久久精品国产亚洲Aⅴ香蕉| 精品久久8x国产免费观看| 思思久久99热只有频精品66| 91久久九九无码成人网站| 久久久久亚洲av无码专区| 亚洲午夜无码AV毛片久久| 久久综合狠狠综合久久激情 | 亚洲色欲久久久综合网东京热| 久久伊人精品青青草原日本| 日本精品久久久中文字幕| 国产精品久久久久AV福利动漫| 亚洲一级Av无码毛片久久精品| 久久久久久综合一区中文字幕 | 色婷婷综合久久久久中文字幕| 999久久久免费精品国产| 成人综合久久精品色婷婷| 久久久久无码国产精品不卡| 久久se精品一区二区影院| 久久香蕉一级毛片| 精品久久久久久| 久久国产一区二区| 久久久国产精品福利免费| 国产精品久久久久AV福利动漫| 精品国际久久久久999波多野| 久久久国产精品亚洲一区| 久久久久久夜精品精品免费啦| 久久亚洲精品中文字幕| 久久精品中文无码资源站|