关于一个单词统计的报告

首先我先说一下这个题目：写一个程序，分析一个文本文件（英文文章）中各个词出现的频率，并且把频率最高的10个词打印出来。

我的大体计划是：

设计计划	需求分析	框架结构	调试	写报告，提交
时间	2月20号到2月24号	2月25号到2月27号	2月28号上午	3月1号到3月2号

下面是我的详细计划：我们是周3接到了王老师的任务写一份统计文章里的英文单词个数，并把前十个输出出来，其实到周6（2月22号）才开始真正去想这个问题的，周6早上开始提着笔记本想我们的程序，记下来如下：首先我选用的是c语言编写，用文件的形式存储英文文章，用结构体存储单词并计数出现的次数，进行排序，最后输出出来前十个。

首先介绍一下我们的需求分析，用户是需要一个统计单词的个数的程序，需要我们输出前十个频率最高的单词。

框架结构用程序来描述：

/* Open the article and echo its contents to stdout before analysing it. */
FILE *fp = fopen("text.txt", "r");
if (NULL == fp)
{
    return -1;
}
printf("文章为： ");
/* Loop on the value returned by getc() rather than on feof(): feof() only
   becomes true after a read has already failed, so testing it first would
   hand the EOF marker itself to putchar() on the last iteration. */
int ch;
while ((ch = getc(fp)) != EOF)
{
    putchar(ch);
}
rewind(fp);                 /* move the file position back to the start of the file */

上面的程序是我参考c语言课本上的，开始的时候我并没有写上 rewind(fp);使程序总是调试不对，出现如下错误，经过翻看c语言课本，发现了错误。

错误原因：程序一开始先把文本文件的内容全部读出并打印了一遍，此时fp的文件位置已经指到了文件的末尾，所以后面再读取时就读不到任何内容。rewind(fp);的作用是把文件位置重新移回文件的开头。

周日（2月23号）上午开始编写框架结构，统计了一下有几个函数，写结构体，其中包括：

/* One table entry: a word together with the number of times it occurred. */
typedef struct WordCount
{
char cWord[20]; /* the word, used as a NUL-terminated string by strcpy/strcmp */
int iCount; /* number of occurrences */
}T_WordCount;
int SearchWord(const char *p);// counts the words and prints the results
void LowerText(char *p);// converts a word to lower case
void SwapItem(T_WordCount *ItemA, T_WordCount * ItemB);// swaps two elements
void SortWord(T_WordCount *pWordSet);// sorts the table

开始的时候我没有加上这个void LowerText(char *p);函数，导致排序总是不对，后来周3（2月26号）下午编程的时候才发现我没有注意大小写，所以加上了这个函数。

周3下午，编写的程序如下：

冒泡排序函数：

/*
 * Orders the MAX entries of pWordSet by descending iCount using bubble sort.
 * Neighbours are exchanged only when the right-hand count is strictly larger,
 * so entries with equal counts keep their relative order.
 */
void SortWord(T_WordCount *pWordSet)
{
    int iPass;
    int iLast = MAX - 1;    /* index of the last slot that is still unsorted */

    for (iPass = 0; iPass < MAX - 1; iPass++, iLast--)
    {
        int iPos = 0;

        while (iPos < iLast)
        {
            T_WordCount *pLeft = &pWordSet[iPos];
            T_WordCount *pRight = pLeft + 1;

            if (pRight->iCount > pLeft->iCount)
            {
                SwapItem(pLeft, pRight);
            }
            iPos++;
        }
    }
}

交换元素的函数如下：

/*
 * Exchanges the contents (word and count) of *ItemA and *ItemB.
 *
 * The entries are copied with whole-struct assignment rather than
 * field-by-field strcpy(): the copy does not depend on cWord being
 * NUL-terminated and stays well defined when both pointers refer to the
 * same entry.
 */
void SwapItem(T_WordCount *ItemA, T_WordCount * ItemB)
{
    T_WordCount Tmp = *ItemA;

    *ItemA = *ItemB;
    *ItemB = Tmp;
}

把单词变成小写形式的函数如下：

/*
 * Converts every ASCII upper-case letter in the NUL-terminated string p to
 * lower case, in place. All other characters are left untouched.
 */
void LowerText(char *p)
{
    char *pt = p;

    /* '\0' marks the end of the string */
    while (*pt != '\0')
    {
        if (*pt >= 'A' && *pt <= 'Z')
        {
            /* shift from the upper-case range into the lower-case range */
            *pt = (char)(*pt - 'A' + 'a');
        }
        pt++;
    }
}

周四开始编写统计函数：

/*
 * Counts how often each word occurs in the NUL-terminated text p and prints
 * the ten most frequent words, each as "word:count ".
 *
 * A word is a run of ASCII letters. Hyphens are skipped, so the letters on
 * either side of a hyphen are joined into one word; any other character ends
 * the current word. Words are lower-cased before they are counted. A word is
 * cut off after 19 letters so that it always fits the word buffer.
 *
 * At most MAX distinct words are tracked; when the table is full a message is
 * printed for every further new word.
 *
 * Always returns 0.
 */
int SearchWord(const char *p)
{
    char c[20] = {0};           /* word currently being assembled */
    size_t iLen = 0;            /* number of letters stored in c */
    int i = 0;
    int iFlag = 0;              /* set once the word was counted or stored */
    T_WordCount tWordSet[MAX];  /* table of distinct words seen so far */

    memset(tWordSet, 0, sizeof(tWordSet));

    /* The terminating '\0' is handled inside the loop body so that a word
       ending exactly at the end of the text is still counted. */
    for (;; ++p)
    {
        const char ch = *p;

        if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
        {
            /* keep one byte free for the terminator; extra letters are dropped */
            if (iLen < sizeof(c) - 1)
            {
                c[iLen++] = ch;
            }
            continue;
        }
        if (ch == '-')
        {
            continue;
        }

        /* Any other character, including the final '\0', ends the word. */
        if (iLen > 0)
        {
            LowerText(c);
            iFlag = 0;
            /* Used slots are contiguous from index 0, so the first empty
               slot means the word has not been seen before. */
            for (i = 0; i < MAX; ++i)
            {
                if (tWordSet[i].cWord[0] == '\0')
                {
                    strcpy(tWordSet[i].cWord, c);
                    tWordSet[i].iCount = 1;
                    iFlag = 1;
                    break;
                }
                if (strcmp(tWordSet[i].cWord, c) == 0)
                {
                    tWordSet[i].iCount++;
                    iFlag = 1;
                    break;
                }
            }
            if (!iFlag)
            {
                printf("保存单词的空间不足. ");
            }
            memset(c, 0, sizeof(c)); /* reset the word buffer */
            iLen = 0;
        }

        if (ch == '\0')
        {
            break;
        }
    }

    /* sort by descending count, then print the first ten non-empty entries */
    SortWord(tWordSet);
    for (i = 0; i < 10; ++i)
    {
        if (tWordSet[i].cWord[0] != '\0')
        {
            printf("%s:%d ", tWordSet[i].cWord, tWordSet[i].iCount);
        }
    }
    return 0;
}

整体来说这个函数编写起来挺困难的：每读到一个单词，都要把它与前面已经记录的单词逐个比较，相同就把计数加一，不同就把它记录下来；是否已经处理过则通过iFlag标志来判断。

整个程序如下：

// 统计单词个数（个人项目）.cpp : Defines the entry point for the console application.

#include "stdafx.h"

#include <stdio.h>

#include <string.h>

// Maximum number of distinct words that can be counted; adjust as needed

#define MAX 1000

// Structure holding each word together with its occurrence count

typedef struct WordCount

{

char cWord[20];

int iCount;

}T_WordCount;

int SearchWord(const char *p);// counts the words and prints the results

void LowerText(char *p);// converts a word to lower case

void SwapItem(T_WordCount *ItemA, T_WordCount * ItemB);// swaps two elements

void SortWord(T_WordCount *pWordSet);// sorts the table

///////////////////////////////////////////////////

/*
 * Program entry point: echoes text.txt to stdout, then prints the ten most
 * frequent words found in that file.
 *
 * Only the first 1000 bytes of the file are analysed, because the text is
 * read into a fixed-size buffer.
 *
 * Returns 0 on success, -1 when text.txt cannot be opened.
 */
int main(int argc, char *argv[])
{
    char s[1001] = {0};     /* analysis buffer; the extra byte holds the terminator */
    size_t nRead = 0;
    int ch = 0;
    FILE *fp = NULL;

    (void)argc;
    (void)argv;

    fp = fopen("text.txt", "r");
    if (NULL == fp)
    {
        return -1;
    }

    printf("文章为： ");
    /* Loop on the value returned by getc() rather than on feof(): feof()
       only becomes true after a read has already failed, so testing it
       first would hand the EOF marker itself to putchar(). */
    while ((ch = getc(fp)) != EOF)
    {
        putchar(ch);
    }
    rewind(fp); /* move the file position back to the start of the file */

    printf(" ");
    printf("出现频率最高的10个单词是: ");

    /* terminate the buffer right after the bytes that were actually read */
    nRead = fread(s, 1, sizeof(s) - 1, fp);
    s[nRead] = '\0';
    fclose(fp);

    SearchWord(s);
    return 0;
}

//////////////////////////////////////

/*
 * Counts how often each word occurs in the NUL-terminated text p and prints
 * the ten most frequent words, each as "word:count ".
 *
 * A word is a run of ASCII letters. Hyphens are skipped, so the letters on
 * either side of a hyphen are joined into one word; any other character ends
 * the current word. Words are lower-cased before they are counted. A word is
 * cut off after 19 letters so that it always fits the word buffer.
 *
 * At most MAX distinct words are tracked; when the table is full a message is
 * printed for every further new word.
 *
 * Always returns 0.
 */
int SearchWord(const char *p)
{
    char c[20] = {0};           /* word currently being assembled */
    size_t iLen = 0;            /* number of letters stored in c */
    int i = 0;
    int iFlag = 0;              /* set once the word was counted or stored */
    T_WordCount tWordSet[MAX];  /* table of distinct words seen so far */

    memset(tWordSet, 0, sizeof(tWordSet));

    /* The terminating '\0' is handled inside the loop body so that a word
       ending exactly at the end of the text is still counted. */
    for (;; ++p)
    {
        const char ch = *p;

        if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
        {
            /* keep one byte free for the terminator; extra letters are dropped */
            if (iLen < sizeof(c) - 1)
            {
                c[iLen++] = ch;
            }
            continue;
        }
        if (ch == '-')
        {
            continue;
        }

        /* Any other character, including the final '\0', ends the word. */
        if (iLen > 0)
        {
            LowerText(c);
            iFlag = 0;
            /* Used slots are contiguous from index 0, so the first empty
               slot means the word has not been seen before. */
            for (i = 0; i < MAX; ++i)
            {
                if (tWordSet[i].cWord[0] == '\0')
                {
                    strcpy(tWordSet[i].cWord, c);
                    tWordSet[i].iCount = 1;
                    iFlag = 1;
                    break;
                }
                if (strcmp(tWordSet[i].cWord, c) == 0)
                {
                    tWordSet[i].iCount++;
                    iFlag = 1;
                    break;
                }
            }
            if (!iFlag)
            {
                printf("保存单词的空间不足. ");
            }
            memset(c, 0, sizeof(c)); /* reset the word buffer */
            iLen = 0;
        }

        if (ch == '\0')
        {
            break;
        }
    }

    /* sort by descending count, then print the first ten non-empty entries */
    SortWord(tWordSet);
    for (i = 0; i < 10; ++i)
    {
        if (tWordSet[i].cWord[0] != '\0')
        {
            printf("%s:%d ", tWordSet[i].cWord, tWordSet[i].iCount);
        }
    }
    return 0;
}

////////////////////////////////////////////////

/*
 * Converts every ASCII upper-case letter in the NUL-terminated string p to
 * lower case, in place. All other characters are left untouched.
 */
void LowerText(char *p)
{
    char *pt = p;

    /* '\0' marks the end of the string */
    while (*pt != '\0')
    {
        if (*pt >= 'A' && *pt <= 'Z')
        {
            /* shift from the upper-case range into the lower-case range */
            *pt = (char)(*pt - 'A' + 'a');
        }
        pt++;
    }
}

/////////////////////////////////////////////////////////////////

/*
 * Exchanges the contents (word and count) of *ItemA and *ItemB.
 *
 * The entries are copied with whole-struct assignment rather than
 * field-by-field strcpy(): the copy does not depend on cWord being
 * NUL-terminated and stays well defined when both pointers refer to the
 * same entry.
 */
void SwapItem(T_WordCount *ItemA, T_WordCount * ItemB)
{
    T_WordCount Tmp = *ItemA;

    *ItemA = *ItemB;
    *ItemB = Tmp;
}

//冒泡排序算法

/*
 * Bubble sort: orders the MAX entries of pWordSet by descending iCount.
 * Neighbours are exchanged only when the right-hand count is strictly larger,
 * so entries with equal counts keep their relative order.
 */
void SortWord(T_WordCount *pWordSet)
{
    int i, j;

    for (j = 0; j < MAX - 1; j++)
    {
        int iSwapped = 0;   /* whether this pass moved anything */

        for (i = 0; i < MAX - 1 - j; i++)
        {
            if (pWordSet[i].iCount < pWordSet[i + 1].iCount)
            {
                SwapItem(&pWordSet[i], &pWordSet[i + 1]);
                iSwapped = 1;
            }
        }

        /* a pass without swaps means the table is already in order */
        if (!iSwapped)
        {
            break;
        }
    }
}

调试部分，就是根据一边编程一边调试，最后也就出来了运行结果如下：

文章存储在text.txt文件中：

英语文章如下：

Whether sixty or sixteen, there is in every human being's heart the lure of wonders, the 1.unfailing childlike appetite of what's next and the joy of the game of living. In the center of your heart and my heart there is a wireless station: so long as it receives messages of beauty, hope, cheer, courage and power from men and from the infinite, so long are you young.

心得总结：感觉这次个人项目非常体现个人的编程水平，虽然本人编程能力不是很强，但通过努力学习了很多的编程经验，考虑到了编程中应该注意的很多问题，还有需求分析设计，比较重要，对后期的维护具有事半功倍的效果。这个单词统计的程序，复习了原来的c语言的知识，文件的读入等，感觉这个很重要。以上是我的个人程序，有什么不足之处，还请指正！