data filter 去掉HTML文件中的所有标记

编写一个C++程序来读取文件,过滤掉所有的标记,将过滤掉标记后的内容输出到一个新文件中。

1. 从文件中读取一个字符

2. 确定字符是否是HTML标记的一部分

3. 打印出所有不是HTML标记的字符

/* --------------------------------------------
 * This program reads an HTML file and writes
 * its text, with all tags removed, to a new file.
 * --------------------------------------------*/

#include <iostream> // Required for cin, cout, cerr
#include <fstream>  // Required for ifstream, ofstream
#include <string>   // Required for string
#include <cstdlib>  // Required for exit

using namespace std;

// Copies every character of `in` to `out`, except the characters that lie
// between a '<' and the next '>' (both delimiters are dropped as well).
// Reading stops at the first failed read, whether that is end-of-file or an
// I/O error, so a broken stream cannot cause an endless loop.
static void strip_tags(std::istream& in, std::ostream& out)
{
    bool text_state = true; // true: plain text, false: inside a tag
    char ch;

    while (in.get(ch))
    {
        if (text_state)
        {
            if (ch == '<')
                text_state = false; // beginning of a tag
            else
                out << ch;          // still text, write it out
        }
        else if (ch == '>')
        {
            text_state = true;      // end of the tag, back to text
        }
    }
}

// Asks for an input and an output file name on standard input, then writes
// the input file's content without its HTML tags to the output file.
// Returns 0 on success and 1 when a file name cannot be read, a file cannot
// be opened, or writing the output fails.
int main()
{
    const std::string separator =
        "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n";
    std::string infile, outfile;

    // Prompt user for name of input file
    std::cout << "Enter the name of the input file : \n( *.*, such as : demo.html ) \n";
    std::cout << "Make sure the file is under current project file ! \n";
    if (!(std::cin >> infile))
    {
        std::cout << separator;
        std::cerr << "Error opening input file" << std::endl;
        return 1;
    }

    std::cout << separator;
    // Prompt user for name of output file
    std::cout << "Enter the name of the output file :  ";
    if (!(std::cin >> outfile))
    {
        std::cout << separator;
        std::cerr << "Error opening output file" << std::endl;
        return 1;
    }

    // Open files; both must be usable before any filtering starts
    std::ifstream html(infile.c_str());
    if (html.fail())
    {
        std::cout << separator;
        std::cerr << "Error opening input file" << std::endl;
        return 1;
    }

    std::ofstream htmltext(outfile.c_str());
    if (htmltext.fail())
    {
        std::cout << separator;
        std::cerr << "Error opening output file" << std::endl;
        return 1;
    }

    strip_tags(html, htmltext);

    html.close();
    // close() flushes the buffer; a failure here means the output is incomplete
    htmltext.close();
    if (htmltext.fail())
    {
        std::cout << separator;
        std::cerr << "Error writing output file" << std::endl;
        return 1;
    }

    std::cout << separator;
    std::cout << "Success transformed ! \n";
    std::cout << "Look for " << outfile << " in current file.\n";
    std::cout << separator;

    return 0;
}

之后就可以拿个HTML文件试试了。不过这个程序只是简单地把所有标记(即“<”和“>”之间的内容)过滤掉,还有待完善:标记之外的内容会原样输出,如果其中有很多无关内容(例如脚本或样式代码),效果就不尽如人意。建议用典型的HTML文件测试,如:

<html>

<head>
<title>我的第一个 HTML 页面</title>
</head>

<body>
<p>body 元素的内容会显示在浏览器中。</p>
<p>title 元素的内容会显示在浏览器的标题栏中。</p>
</body>

</html>


原文地址:https://www.cnblogs.com/Genesis2018/p/9079829.html