要求:讀取一個文本,然後統計裡面出現的單詞,打印每個單詞出現的次數。僅僅考慮英文單詞的情形,不考慮中文。
小乓練題:
// Word-frequency counter (first solution).
//
// Reads the whole of c:\a.txt in binary mode, splits it into words made of
// consecutive alphanumeric characters, echoes every word as it is found and
// finally prints each distinct word followed by its number of occurrences,
// in order of first appearance.
//
// Parameters: argc / argv are not used.
// Returns:    0 in every case, including when the input file cannot be opened.
int main(int argc, char* argv[])
{
    using namespace std;

    ifstream infile("c:\\a.txt", ios::binary);
    if (!infile)
    {
        cout << "Can not open sourse file!" << endl;
        return 0;
    }

    // Determine the file length by seeking to the end.
    infile.seekg(0, ios::end);
    const streamoff nLength = infile.tellg();
    infile.seekg(0, ios::beg);

    // Read the file directly into a string of exactly the right size.
    // Building the string from an explicit length (instead of from a raw
    // char* that has no terminating NUL) avoids reading past the buffer and
    // keeps embedded NUL bytes from truncating the text.
    string s;
    if (nLength > 0) // tellg() yields -1 on failure; treat that as empty
    {
        s.resize(static_cast<string::size_type>(nLength));
        infile.read(&s[0], static_cast<streamsize>(nLength));
        // Keep only the bytes that were really read.
        s.resize(static_cast<string::size_type>(infile.gcount()));
    }
    infile.close();

    // Parallel vectors: vecCount[i] is the number of times vecSubstr[i] occurred.
    vector<string> vecSubstr;
    vector<int> vecCount;

    const string::size_type nSize = s.size();
    string::size_type next = 0;
    while (next < nSize)
    {
        const string::size_type pre = next;

        // Advance to the end of the current run of alphanumeric characters.
        // The cast is required because isalnum is undefined for negative
        // values, which a plain char can hold for non-ASCII bytes.
        while (next < nSize && isalnum(static_cast<unsigned char>(s[next])))
        {
            ++next;
        }

        if (pre != next)
        {
            const string temp = s.substr(pre, next - pre);
            // Echo each word as it is encountered.
            cout << temp << endl;

            // Look the word up among those already seen.
            vector<string>::size_type iPosition = 0;
            while (iPosition < vecSubstr.size() && vecSubstr[iPosition] != temp)
            {
                ++iPosition;
            }

            if (iPosition == vecSubstr.size())
            {
                // First occurrence of this word.
                vecSubstr.push_back(temp);
                vecCount.push_back(1);
            }
            else
            {
                ++vecCount[iPosition];
            }
        }

        // Step over the separator that ended the word (or the lone separator).
        ++next;
    }

    // Print every distinct word followed by its count, one item per line.
    for (vector<string>::size_type j = 0; j < vecSubstr.size(); ++j)
    {
        cout << vecSubstr[j] << endl << vecCount[j] << endl;
    }

    system("pause");
    return 0;
}
C++代碼:
// Word-frequency counter (second solution, based on std::map).
//
// Reads the whole of C:\text.txt, splits it into words made of consecutive
// alphanumeric characters and prints every distinct word with its number of
// occurrences, in the map's sorted key order.
//
// Parameters: argc / argv are not used.
// Returns:    -1 if the file cannot be opened, 0 otherwise.
int main(int argc, char* argv[])
{
    // Path of the input file. A string literal must be bound to a
    // pointer-to-const.
    const char* szPath = "C:\\text.txt";
    std::ifstream fin(szPath);
    if (!fin)
    {
        std::cout<<"Can not open file"<<std::endl;
        return -1;
    }

    // Slurp the complete stream into one string. The extra parentheses stop
    // the statement from being parsed as a function declaration.
    const std::string strText((std::istreambuf_iterator<char>(fin)),
                              std::istreambuf_iterator<char>());

    typedef std::map<std::string, int> CountMap;
    CountMap counter;

    const std::string::size_type nLength = strText.length();
    std::string::size_type nLeft = 0;
    while (nLeft < nLength)
    {
        // Skip separators until the first alphanumeric character.
        // The cast is required because isalnum is undefined for negative
        // values, which a plain char can hold for non-ASCII bytes.
        while (nLeft < nLength && !isalnum(static_cast<unsigned char>(strText[nLeft])))
        {
            ++nLeft;
        }
        if (nLeft == nLength)
        {
            // Only separators remained; there is no further word.
            break;
        }

        // Find the first position after the word.
        std::string::size_type nRight = nLeft + 1;
        while (nRight < nLength && isalnum(static_cast<unsigned char>(strText[nRight])))
        {
            ++nRight;
        }

        // [nLeft, nRight) is a complete word. It is counted even when it
        // ends exactly at the end of the text, so the last word of a file
        // without a trailing separator is not lost.
        ++counter[strText.substr(nLeft, nRight - nLeft)];

        nLeft = nRight;
    }

    // Print "word<TAB><TAB>count" for every distinct word.
    for (CountMap::iterator iter = counter.begin(); counter.end()!=iter; ++iter)
    {
        std::cout<<iter->first<<"\t\t"<<iter->second<<std::endl;
    }

    system("pause");
    return 0;
}
python 代碼:
import re

# Path of the text file whose words are counted.
filepath = r'c:/text.txt'

# Read the complete file content.
with open(filepath) as source:
    text = source.read()

# Extract runs of ASCII letters and digits. Using findall instead of
# re.split(r'\W+') avoids the empty strings that split produces when the
# text starts or ends with a separator, and restricts words to English
# alphanumerics (underscores and non-ASCII characters act as separators).
words = re.findall(r'[A-Za-z0-9]+', text)

# Map each distinct word to its number of occurrences.
d = {}
for item in words:
    d[item] = d.get(item, 0) + 1

# Print "word<TAB><TAB>count" for every distinct word.
for key, value in d.items():
    print('%s\t\t%s' % (key, value))
小乓加油!