国产精品成人久久久,人人狠狠综合久久亚洲,日韩人妻无码精品久久久不卡

K-近鄰法的實現

K-近鄰法是根據距離最近的 k 個樣例的類型來推測該樣例的類型。
實現中中主要的環節有：
·訓練樣例格式和測試樣例格式的定義
·樣例結構體的定義
·訓練樣例和測試樣例的讀取
·樣例距離的計算，歐氏距離
·距離矩陣的生成
·針對每個測試樣例，得到其到每個訓練樣例的距離，根據距離由小到大排序，更具距離和權重成反比的關系，計算每個類型的總的權重，得到最大權重的那個類型，即是當前測試樣例的類型
·k 值的選擇和設定

訓練樣本的格式是：
每行代表一個樣例，每行的第一個元素是該樣例的類型，后面是該樣例的特征向量
例如：

train.txt
a    1 2 3 4 5
b    5 4 3 2 1
c    3 3 3 3 3
d    -3 -3 -3 -3 -3
a    1 2 3 4 4
b    4 4 3 2 1
c    3 3 3 2 4
d    0 0 1 1 -2

測試樣例的格式是：
每行代表一個樣例，每行即是該樣例的特征向量
例如：

test.txt
1 2 3 2 4
2 3 4 2 1
8 7 2 3 5
-3 -2 2 4 0
-4 -4 -4 -4 -4
1 2 3 4 4
4 4 3 2 1
3 3 3 2 4
0 0 1 1 -2

測試樣例的輸出結果的格式是：
格式與訓練樣例一樣，即每行代表一個樣例，每行的第一個元素是學習到的測試樣例的類型，后面是該樣例的特征向量
例如：

result.txt
a    1 2 3 2 4
b    2 3 4 2 1
b    8 7 2 3 5
a    -3 -2 2 4 0
d    -4 -4 -4 -4 -4
a    1 2 3 4 4
b    4 4 3 2 1
c    3 3 3 2 4
d    0 0 1 1 -2

具體的程序實現如下：

  1 /*
  2     K-近鄰法的實現
  3     mark
  4     goonyangxiaofang@163.com
  5     QQ 591 247 876
  6     2012.02.13
  7 */
  8
  9
10 #include <iostream>
11 #include <string>
12 #include <vector>
13 #include <set>
14 #include <map>
15 #include <fstream>
16 #include <sstream>
17 #include <cassert>
18 #include <cmath>
19 using namespace std;
20
21 // 樣例結構體，所屬類型和特征向量
22 struct sample
23 {
24     string type;
25     vector<double> features;
26 };
27
28 // 類型和距離結構體，未用到
29 struct typeDistance
30 {
31     string type;
32     double distance;
33 };
34
35 bool operator < (const typeDistance& lhs, const typeDistance& rhs)
36 {
37     return lhs.distance < rhs.distance;
38 }
39
40 // 讀取訓練樣本
41 // 訓練樣本的格式是：每行代表一個樣例
42 // 每行的第一個元素是類型名，后面的是樣例的特征向量
43 // 例如：
44 /*
45 a    1 2 3 4 5
46 b    5 4 3 2 1
47 c    3 3 3 3 3
48 d    -3 -3 -3 -3 -3
49 a    1 2 3 4 4
50 b    4 4 3 2 1
51 c    3 3 3 2 4
52 d    0 0 1 1 -2
53 */
54 void readTrain(vector<sample>& train, const string& file)
55 {
56     ifstream fin(file.c_str());
57     if (!fin)
58     {
59         cerr << "File error!" << endl;
60         exit(1);
61     }
62     string line;
63     double d = 0.0;
64     while (getline(fin, line))
65     {
66         istringstream sin(line);
67         sample ts;
68         sin >> ts.type;
69         while (sin >> d)
70         {
71             ts.features.push_back(d);
72         }
73         train.push_back(ts);
74     }
75     fin.close();
76 }
77
78 // 讀取測試樣本
79 // 每行代表一個樣例
80 // 每一行是一個樣例的特征向量
81 // 例如：
82 /*
83 1 2 3 2 4
84 2 3 4 2 1
85 8 7 2 3 5
86 -3 -2 2 4 0
87 -4 -4 -4 -4 -4
88 1 2 3 4 4
89 4 4 3 2 1
90 3 3 3 2 4
91 0 0 1 1 -2
92 */
93 void readTest(vector<sample>& test, const string& file)
94 {
95     ifstream fin(file.c_str());
96     if (!fin)
97     {
98         cerr << "File error!" << endl;
99         exit(1);
100     }
101     double d = 0.0;
102     string line;
103     while (getline(fin, line))
104     {
105         istringstream sin(line);
106         sample ts;
107         while (sin >> d)
108         {
109             ts.features.push_back(d);
110         }
111         test.push_back(ts);
112     }
113     fin.close();
114 }
115
116 // 計算歐氏距離
117 double euclideanDistance(const vector<double>& v1, const vector<double>& v2)
118 {
119     assert(v1.size() == v2.size());
120     double ret = 0.0;
121     for (vector<double>::size_type i = 0; i != v1.size(); ++i)
122     {
123         ret += (v1[i] - v2[i]) * (v1[i] - v2[i]);
124     }
125     return sqrt(ret);
126 }
127
128 // 初始化距離矩陣
129 // 該矩陣是根據訓練樣本和測試樣本而得
130 // 矩陣的行數為測試樣本的數目，列數為訓練樣本的數目
131 void initDistanceMatrix(vector<vector<double> >& dm, const vector<sample>& train, const vector<sample>& test)
132 {
133     for (vector<sample>::size_type i = 0; i != test.size(); ++i)
134     {
135         vector<double> vd;
136         for (vector<sample>::size_type j = 0; j != train.size(); ++j)
137         {
138             vd.push_back(euclideanDistance(test[i].features, train[j].features));
139         }
140         dm.push_back(vd);
141     }
142 }
143
144 // K-近鄰法的實現
145 // 設定不同的 k 值，給每個測試樣例予以一個類型
146 // 距離和權重成反比
147 void knnProcess(vector<sample>& test, const vector<sample>& train, const vector<vector<double> >& dm, unsigned int k)
148 {
149     for (vector<sample>::size_type i = 0; i != test.size(); ++i)
150     {
151         multimap<double, string> dts;
152         for (vector<double>::size_type j = 0; j != dm[i].size(); ++j)
153         {
154             if (dts.size() < k)
155             {
156                 dts.insert(make_pair(dm[i][j], train[j].type));
157             }
158             else
159             {
160                 multimap<double, string>::iterator it = dts.end();
161                 --it;
162                 if (dm[i][j] < it->first)
163                 {
164                     dts.erase(it);
165                     dts.insert(make_pair(dm[i][j], train[j].type));
166                 }
167             }
168         }
169         map<string, double> tds;
170         string type = "";
171         double weight = 0.0;
172         for (multimap<double, string>::const_iterator cit = dts.begin(); cit != dts.end(); ++cit)
173         {
174             // 不考慮權重的情況，在 k 個樣例中只要出現就加 1
175             // ++tds[cit->second];
176
177             // 這里是考慮距離與權重的關系，距離越大權重越小
178             tds[cit->second] += 1.0 / cit->first;
179             if (tds[cit->second] > weight)
180             {
181                 weight = tds[cit->second];
182                 type = cit->second;
183             }
184         }
185         test[i].type = type;
186     }
187 }
188
189 // 輸出結果
190 // 輸出的格式和訓練樣本的格式一樣
191 // 每行表示一個樣例，第一個元素是該樣例的類型，后面是該樣例的特征向量
192 // 例如：
193 /*
194 a    1 2 3 2 4
195 b    2 3 4 2 1
196 b    8 7 2 3 5
197 a    -3 -2 2 4 0
198 d    -4 -4 -4 -4 -4
199 a    1 2 3 4 4
200 b    4 4 3 2 1
201 c    3 3 3 2 4
202 d    0 0 1 1 -2
203 */
204 void writeTest(const vector<sample>& test, const string& file)
205 {
206     ofstream fout(file.c_str());
207     if (!fout)
208     {
209         cerr << "File error!" << endl;
210         exit(1);
211     }
212     for (vector<sample>::size_type i = 0; i != test.size(); ++i)
213     {
214         fout << test[i].type << '\t';
215         for (vector<double>::size_type j = 0; j != test[i].features.size(); ++j)
216         {
217             fout << test[i].features[j] << ' ';
218         }
219         fout << endl;
220     }
221 }
222
223 // 封裝
224 void knn(const string& file1, const string& file2, const string& file3, int k)
225 {
226     vector<sample> train, test;
227     readTrain(train, file1.c_str());
228     readTest(test, file2.c_str());
229     vector<vector<double> > dm;
230     initDistanceMatrix(dm, train, test);
231     knnProcess(test, train, dm, k);
232     writeTest(test, file3.c_str());
233 }
234
235 // 測試
236 int main()
237 {
238     knn("train.txt", "test.txt", "result.txt", 5);
239     return 0;
240 }
241

posted on 2012-02-14 09:47 unixfy 閱讀(4923) 評論(0) 編輯收藏引用

常用鏈接

留言簿(3)

隨筆檔案

搜索

最新評論

閱讀排行榜

評論排行榜

只有注冊用戶登錄后才能發表評論。
【推薦】100%開源！大型工業跨平臺軟件C++源碼提供，建模，組態！



網站導航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理