欧美激情久久久久,亚洲午夜在线观看视频在线,国产精品日韩在线观看

字符串相似度算法介紹(整理)(轉(zhuǎn))

最近在做這方面的應(yīng)用，把我找到的資料貼出來，有需要的人可以參考參考。
1．編輯距離（Levenshtein Distance）
編輯距離就是用來計算從原串（s）轉(zhuǎn)換到目標串(t)所需要的最少的插入，刪除和替換
的數(shù)目，在NLP中應(yīng)用比較廣泛，如一些評測方法中就用到了（wer,mWer等），同時也常用來計算你對原文本所作的改動數(shù)。編輯距離的算法是首先由俄國科學(xué)家Levenshtein提出的，故又叫Levenshtein Distance。
Levenshtein Distance算法可以看作動態(tài)規(guī)劃。它的思路就是從兩個字符串的左邊開始比較,記錄已經(jīng)比較過的子串相似度(實際上叫做距離),然后進一步得到下一個字符位置時的相似度。用下面的例子: GUMBO和GAMBOL。當算到矩陣D[3,3]位置時,也就是當比較到GUM和GAM時,要從已經(jīng)比較過的3對子串GU-GAM, GUM-GA和GU-GA之中選一個差別最小的來當它的值. 所以要從左上到右下構(gòu)造矩陣。
編輯距離的偽算法：
整數(shù) Levenshtein距離(字符 str1[1..lenStr1], 字符 str2[1..lenStr2])
   宣告 int d[0..lenStr1, 0..lenStr2]
   宣告 int i, j, cost

   對于 i 等于由 0 至 lenStr1
       d[i, 0] := i
   對于 j 等于由 0 至 lenStr2
       d[0, j] := j
   對于 i 等于由 1 至 lenStr1
       對于 j 等于由 1 至 lenStr2
           若 str1[i] = str2[j] 則 cost := 0
                                否則 cost := 1
           d[i, j] := 最小值(
                                d[i-1, j ] + 1,     // 刪除
                                d[i , j-1] + 1,     // 插入
                                d[i-1, j-1] + cost   // 替換
                            )
返回 d[lenStr1, lenStr2]

double Minimum(double a, double b, double c)
{
double mi;

mi = a;
if (b < mi) {
mi = b;
}
if (c < mi) {
mi = c;
}
return mi;
}

int* GetCellPointer(int *pOrigin, int col, int row, int nCols)
{
return pOrigin + col + (row * (nCols + 1));
}

int GetAt(int *pOrigin, int col, int row, int nCols)
{
int *pCell;

pCell = GetCellPointer (pOrigin, col, row, nCols);
return *pCell;
}

void PutAt(int *pOrigin, int col, int row, int nCols, double x)
{
int *pCell;
pCell = GetCellPointer (pOrigin, col, row, nCols);
*pCell = x;
}

//編輯距離
LD(const char *s, const char *t)
{
int *d; // pointer to matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i1; // ith character of s
char s_i2; // ith character of s
char t_j1; // jth character of t
char t_j2; // jth character of t
int *cost; // cost代價矩陣
int result; // result
int cell; // contents of target cell
int above; // contents of cell immediately above
int left; // contents of cell immediately to left
int diag; // contents of cell immediately above and to left
int sz; // number of cells in matrix

// Step 1

n = strlen (s);
m = strlen (t);
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
sz = (n+1) * (m+1) * sizeof (int);
d = (int *) malloc (sz);
cost = (int *) malloc (sz);

// Step 2

for (i = 0; i <= n; i++)
{
PutAt (d, i, 0, n, i);
}

for (j = 0; j <= m; j++)
{
PutAt (d, 0, j, n, j);
}
for (int g=0;g<=m;g++)//把代價距離矩陣全部初始化為同一個值，以后可根據(jù)此值判斷相應(yīng)的方格是否被賦過值
{
for(int h=0;h<=n;h++)
{
PutAt(cost,h,g,n,2);
}
}
// Step 3

for (i = 1; i <= n; i++)
{

s_i1 = s[i-1];
s_i2 = s[i];
bool sbd=false;
bool tbd=false;
if(s_i1>=' '&&s_i1<='@'||s_i1>='A'&&s_i1<='~' )
{//s為標點符號或其他非中文符號和數(shù)字
sbd=true;
}
// Step 4

for (j = 1; j <= m; j++)
{

   tbd=false;
   t_j1 = t[j-1];
   t_j2 = t[j];
   // Step 5
   if(t_j1>=' '&&t_j1<='@'||t_j1>='A'&&t_j1<='~' )
   {//t也為標點符號
    tbd=true;
   }
   if(!sbd)
   {//s為漢字
    if(!tbd)
    {//t也為漢字
     if (s_i1 == t_j1&&s_i2 == t_j2)
     {
      bool tt=false;
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,0);
       tt=true;
      }
      if(tt)
      {//因為st全市漢字，所以把代價矩陣他相鄰的未賦過值的三個格賦值
       int temp1=GetAt(cost,i+1,j,n);
       if(temp1==2)
       {
        PutAt(cost,i+1,j,n,0);
       }
       int temp2=GetAt(cost,i,j+1,n);
       if(temp2==2)
       {
        PutAt(cost,i,j+1,n,0);
       }
       int temp3=GetAt(cost,i+1,j+1,n);
       if(temp3==2)
       {
        PutAt(cost,i+1,j+1,n,0);
       }
      }
     }
     else
     {
      bool tt=false;
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,1);
       tt=true;
      }
      if(tt)
      {
       int temp1=GetAt(cost,i+1,j,n);
       if(temp1==2)
       {
        PutAt(cost,i+1,j,n,1);
       }
       int temp2=GetAt(cost,i,j+1,n);
       if(temp2==2)
       {
        PutAt(cost,i,j+1,n,1);
       }
       int temp3=GetAt(cost,i+1,j+1,n);
       if(temp3==2)
       {
        PutAt(cost,i+1,j+1,n,1);
       }
      }
     }
    }
    else
    {//t為符號
     bool tt=false;
     int temp=GetAt(cost,i,j,n);
     if(temp==2)
     {
      PutAt(cost,i,j,n,1);
      tt=true;
     }
     if(tt)
     {
      int temp1=GetAt(cost,i+1,j,n);
      if(temp1==2)
      {
       PutAt(cost,i+1,j,n,1);
      }
     }

    }

   }
   else
   {//s為符號
    if(!tbd)
    {//t為漢字
     bool tt=false;
     int temp=GetAt(cost,i,j,n);
     if(temp==2)
     {
      PutAt(cost,i,j,n,1);
      tt=true;
     }
     if(tt)
     {
      int temp1=GetAt(cost,i,j+1,n);
      if(temp1==2)
      {
       PutAt(cost,i,j+1,n,1);
      }
     }
    }
    else
    {
     if(s_i1==t_j1)
     {
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,0);
      }
     }
     else
     {
      int temp=GetAt(cost,i,j,n);
      if(temp==2)
      {
       PutAt(cost,i,j,n,1);
      }
     }
    }

}

// Step 6

   above = GetAt (d,i-1,j, n);
   left = GetAt (d,i, j-1, n);
   diag = GetAt (d, i-1,j-1, n);
   int curcost=GetAt(cost,i,j,n);
   cell = Minimum (above + 1, left + 1, diag + curcost);
   PutAt (d, i, j, n, cell);
}
}

// Step 7

result = GetAt (d, n, m, n);
free (d);
return result;

}

2．最長公共子串 (LCS)
LCS問題就是求兩個字符串最長公共子串的問題。解法就是用一個矩陣來記錄兩個字符
串中所有位置的兩個字符之間的匹配情況，若是匹配則為1，否則為0。然后求出對角線最長的1序列，其對應(yīng)的位置就是最長匹配子串的位置.
下面是字符串21232523311324和字符串312123223445的匹配矩陣，前者為X方向的，
后者為Y方向的。不難找到，紅色部分是最長的匹配子串。通過查找位置我們得到最長的匹配子串為：21232
0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
但是在0和1的矩陣中找最長的1對角線序列又要花去一定的時間。通過改進矩陣的生成方式和設(shè)置標記變量，可以省去這部分時間。下面是新的矩陣生成方式：
0 0 0 1 0 0 0 1 1 0 0 1 0 0 0
　　0 1 0 0 0 0 0 0 0 2 1 0 0 0 0
　　1 0 2 0 1 0 1 0 0 0 0 0 1 0 0
　　0 2 0 0 0 0 0 0 0 1 1 0 0 0 0
　　1 0 3 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 4 0 0 0 2 1 0 0 1 0 0 0
　　1 0 1 0 5 0 1 0 0 0 0 0 2 0 0
　　1 0 1 0 1 0 1 0 0 0 0 0 1 0 0
　　0 0 0 2 0 0 0 2 1 0 0 1 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
　　0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
　　0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
當字符匹配的時候，我們并不是簡單的給相應(yīng)元素賦上1，而是賦上其左上角元素的值加一。我們用兩個標記變量來標記矩陣中值最大的元素的位置，在矩陣生成的過程中來判斷當前生成的元素的值是不是最大的，據(jù)此來改變標記變量的值，那么到矩陣完成的時候，最長匹配子串的位置和長度就已經(jīng)出來了。

//最長公共子串
char* LCS(char*left,char* right){
int lenLeft,lenRight;
lenLeft = strlen(left);
lenRight = strlen(right);
int *c = new int[lenRight];
int start,end,len;
end = len = 0;
for(int i = 0; i < lenLeft; i++){
for(int j = lenRight-1; j >= 0; j--){
   if(left[i] == right[j]){
    if(i == 0 || j == 0)
     c[j] = 1;
    else
     c[j] = c[j-1]+1;
   }
   else
    c[j] = 0;
   if(c[j] > len){
    len = c[j];
    end = j;
   }
}
}
char *p = new char[len+1];
start = end - len + 1;
for(i = start; i <= end; i++)
p[i - start] = right[i];
p[len] = '/0';
return p;
}
3. 余弦定理 (向量空間算法)
余弦定理古老而廣泛的數(shù)學(xué)概念，在各個學(xué)科及實踐中都得到了大量的應(yīng)用，這里簡單的介紹下其在判斷兩個字符串相似度的應(yīng)用。
在余弦定理中基本的公式為：

假如字符串s1與s2，比較兩個字符串的相似度，sim(s1,s2)，假設(shè)s1,s2中含有n個不同的字符，其分別為c1,c2,...cn，判斷字符串的相似度轉(zhuǎn)換為兩個字符串對應(yīng)的向量v1,v2之間夾角大小的判斷，余弦值越大其向量之間的夾角越小，s1與S2的相似度越大。
向量空間算法的介紹：
在向量空間模型中，文本泛指各種機器可讀的記錄。用D（Document）表示，特征項（Term，用t表示）是指出現(xiàn)在文檔D中且能夠代表該文檔內(nèi)容的基本語言單位，主要是由詞或者短語構(gòu)成，文本可以用特征項集表示為D(T1，T2，…，Tn)，其中Tk是特征項，1<=k<=N。例如一篇文檔中有a、b、c、d四個特征項，那么這篇文檔就可以表示為D(a，b，c，d)。對含有n個特征項的文本而言，通常會給每個特征項賦予一定的權(quán)重表示其重要程度。即D＝D(T1，W1；T2，W2；…，Tn，Wn)，簡記為D＝D(W1，W2，…，Wn)，我們把它叫做文本D的向量表示。其中Wk是Tk 的權(quán)重，1<=k<=N。在上面那個例子中，假設(shè)a、b、c、d的權(quán)重分別為30，20，20，10，那么該文本的向量表示為 D(30，20，20，10)。在向量空間模型中，兩個文本D1和D2之間的內(nèi)容相關(guān)度Sim(D1，D2)常用向量之間夾角的余弦值表示，公式為：

其中，W1k、W2k分別表示文本D1和D2第K個特征項的權(quán)值，1<=k<=N。我們可以利用類似的方法來計算兩個字符串的相關(guān)度。
這個算法網(wǎng)上沒找到，雖然我寫過，但是沒什么通用性，就不貼出來。很簡單的，有興趣的可以自己寫一個。

posted on 2012-04-01 07:52 Snape 閱讀(664) 評論(0) 編輯收藏引用所屬分類: C++ 轉(zhuǎn)載

只有注冊用戶登錄后才能發(fā)表評論。


相關(guān)文章: 深入理解C語言(轉(zhuǎn)coolshell) 程序員的奮斗史（一）——淺談幾種主要編程語言(轉(zhuǎn)) 類間關(guān)系（轉(zhuǎn)）教你文本聚類（轉(zhuǎn)）字符串相似度算法介紹(整理)(轉(zhuǎn)) 【轉(zhuǎn)】面向?qū)ο笤O(shè)計中的經(jīng)驗原則關(guān)于默認構(gòu)造函數(shù)的幾個錯誤認識（四種情況下，編譯器會生成默認構(gòu)造函數(shù)）如何學(xué)好C++語言（轉(zhuǎn)）一個類的實例化對象所占空間的大小 extern "C"

網(wǎng)站導(dǎo)航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理

青青草原综合久久大伊人导航_色综合久久天天综合_日日噜噜夜夜狠狠久久丁香五月_热久久这里只有精品

chenglong7997

字符串相似度算法介紹(整理)(轉(zhuǎn))

導(dǎo)航

統(tǒng)計

常用鏈接

留言簿

隨筆分類

隨筆檔案

文章分類

文章檔案

my

搜索

最新評論

閱讀排行榜

評論排行榜