CPage

  1 #ifndef _Page_H_030728_
  2 #define _Page_H_030728_
  3 
  4 #include <string>
  5 #include <map>
  6 #include <vector>
  7 #include <list>
  8 #include "Url.h"
  9 #include "list.h"
 10 #include "uri.h"
 11 #include "hlink.h"
 12 
 13 
 14 //large enough to hold sina's 437 links 
 15 
 16 const int ANCHOR_TEXT_LEN       = 256;
 17 const int MAX_URL_REFERENCES    = 1000;
 18 const int URL_REFERENCE_LEN     = (URL_LEN+ANCHOR_TEXT_LEN)*MAX_URL_REFERENCES*1/2 ;
 19 const int MAX_TAG_NUMBERS    = 10000;
 20 
 21 using namespace std;
 22 
 23 // plain text or other
 24 enum page_type {
 25     PLAIN_TEXT,
 26     OTHER    
 27 };
 28 
 29 struct RefLink4SE // <href src...>, <area src...>
 30 {
 31     char *link;
 32     char *anchor_text;
 33     string strCharset;
 34 };
 35 
 36 struct RefLink4History    // <img src...>,<script src...>
 37 {
 38     char *link;
 39 };
 40 
 41 class CPage
 42 {
 43 public:
 44     // url & location
 45     string m_sUrl;        //网页对应的URL字符串
 46 
 47     // header
 48     string m_sHeader;//网页头信息
 49     int m_nLenHeader;//网页头信息的长度
 50 
 51     int m_nStatusCode;//状态码
 52     int m_nContentLength;;//从网页头信息中提取的网页体的长度,一般不是很准
 53     string m_sLocation;//网页的转向信息,可以判断这个网页是否重定向
 54     bool m_bConnectionState;    //是否支持持续链接Keep-Alive为true否则为false
 55     string m_sContentEncoding;//网页体的编码
 56     string m_sContentType;//网页体的类型
 57     string m_sCharset;//网页体的字符集
 58     string m_sTransferEncoding;//网页体的传输编码方式
 59 
 60     // content
 61     string m_sContent;//网页体信息
 62     int m_nLenContent;//网页体信息的长度
 63     string m_sContentNoTags;
 64 
 65 
 66     // link, in a lash-up state
 67     string m_sContentLinkInfo;
 68     //从网页体中提取出包含超链接信息的标识,例如<img src="www.baidu.com"/> ,
 69     //<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>
 70 
 71     // links for SE, in a lash-up state
 72     string m_sLinkInfo4SE;
 73     //再从m_sContentLinkInfo提取出<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>标识信息
 74     int m_nLenLinkInfo4SE;;//m_sLinkInfo4SE的长度
 75 
 76     // links for history archiving, in a lash-up state
 77     string m_sLinkInfo4History;//再从m_sContentLinkInfo提取出<img src="www.baidu.com">标识信息
 78     int m_nLenLinkInfo4History;//m_sLinkInfo4History的长度
 79 
 80 
 81     // links for SE, in a good state
 82     RefLink4SE m_RefLink4SE[MAX_URL_REFERENCES];//保存URL信息<-->URL的描述信息[这里URL指的是为搜索准备的链接] 即每个网页最多能保存1000个链接
 83     int m_nRefLink4SENum;//上面数组的长度
 84 
 85     // links for history archiving, in a good state
 86     RefLink4History m_RefLink4History[MAX_URL_REFERENCES/2];//保存URL信息[这个URL指的是为历史网页存档准备的链接]
 87     int m_nRefLink4HistoryNum;//上面数组的长度
 88 
 89     //map<string,string,less<string> > m_mapLink4SE;
 90     map<string,string> m_mapLink4SE;//保存URL信息<-->URL的描述信息[这里URL指的是为搜索准备的链接]
 91                                     //-----当然了这个map容器的作用主要是删除一个网页中相同的URL
 92     vector<string > m_vecLink4History;//保存URL信息--当然了这个vector容器的作用主要是删除一个网页中相同的URL
 93 
 94     // page type
 95     enum page_type m_eType;//网页的类型
 96 
 97     // parsed url lists
 98     //list<string>    m_listLink4SE;
 99 
100 public:
101     CPage();
102     CPage(string strUrl, string strLocation, char* header, char* body, int nLenBody);
103     ~CPage();
104 
105     // parse header information from the header content
106     void ParseHeaderInfo(string header);//解析网页头信息
107 
108     // parse hyperlinks from the page content
109     bool ParseHyperLinks();//从网页中提取出链接信息
110 
111     bool NormalizeUrl(string& strUrl);//判断strUrl是不是正规的url
112 
113     bool IsFilterLink(string plink);//判断plink链接是不是要过滤掉
114 
115 private:
116     // parse header information from the header content
117     void GetStatusCode(string header);//得到状态码
118     void GetContentLength(string header);//从网页头信息中提取的网页体的长度,一般不是很准
119     void GetConnectionState(string header);//得到连接状态
120     void GetLocation(string header);//得到重定向信息
121     void GetCharset(string header);//得到字符集
122     void GetContentEncoding(string header);//得到网页体编码
123     void GetContentType(string header);//得到网页体类型
124     void GetTransferEncoding(string header);//得到网页体的传输编码方式
125 
126     // parse hyperlinks from the web page
127     bool GetContentLinkInfo();//从网页体中提取出包含超链接信息的标识,
128                               //例如<img src="www.baidu.com"/> ,<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>
129 
130 
131     bool GetLinkInfo4SE();//再从m_sContentLinkInfo提取出<a href="www.baidu.com">百度</a> <area href="www.baidu.com">百度</area>标识信息
132     bool GetLinkInfo4History();//再从m_sContentLinkInfo提取出<img src="www.baidu.com">标识信息
133     bool FindRefLink4SE();//最终得到为搜索引擎准备的超链接
134     bool FindRefLink4History();//最终得到为历史网页存档准备的超链接
135 
136 };
137 
138 #endif /* _Page_H_030728_ */
   1 /*Page handling
   2  */
   3 
   4 #include <iostream>
   5 #include <string>
   6 #include <cstring>
   7 #include <map>
   8 #include <vector>
   9 #include <iterator>
  10 #include "Url.h"
  11 #include "Page.h"
  12 #include "StrFun.h"
  13 
  14 
  15 //带参构造函数
  16 CPage::CPage()
  17 {
  18     //初始化成员变量
  19     m_nStatusCode = 0;
  20     m_nContentLength = 0;
  21     m_sLocation = "";
  22     m_bConnectionState = false;
  23     m_sContentEncoding = "";
  24     m_sContentType = "";
  25     m_sCharset = "";
  26     m_sTransferEncoding = "";
  27 
  28     m_sContentLinkInfo = "";
  29     m_sLinkInfo4SE = "";
  30     m_sLinkInfo4History = "";
  31 
  32     m_sContentNoTags = "";
  33     m_nRefLink4SENum = 0;
  34     m_nRefLink4HistoryNum = 0;
  35     m_eType = PLAIN_TEXT;
  36 
  37 
  38     //超链接信息以及超链接的描述信息初始化都为空
  39     for(int i=0; i< MAX_URL_REFERENCES; i++ ){
  40         m_RefLink4SE[i].link = NULL;
  41         m_RefLink4SE[i].anchor_text = NULL;
  42         m_RefLink4SE[i].strCharset = "";
  43 
  44         if(i < MAX_URL_REFERENCES/2){
  45             m_RefLink4History[i].link = NULL;
  46         }
  47     }
  48 
  49 }
  50 
  51 CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody)
  52 {
  53     //assert( header != NULL );
  54     //assert( body != NULL );
  55     //assert( nLenBody > 0 );
  56 
  57     // CPage();
  58     m_nStatusCode = 0;
  59     m_nContentLength = 0;
  60     m_sLocation = "";
  61     m_bConnectionState = false;
  62     m_sContentEncoding = "";
  63     m_sContentType = "";
  64     m_sCharset = "";
  65     m_sTransferEncoding = "";
  66 
  67     m_sContentLinkInfo = "";
  68     m_sLinkInfo4SE = "";
  69     m_sLinkInfo4History = "";
  70 
  71     m_sContentNoTags = "";
  72     m_nRefLink4SENum = 0;
  73     m_nRefLink4HistoryNum = 0;
  74     m_eType = PLAIN_TEXT;
  75 
  76     //超链接信息以及超链接的描述信息初始化都为空
  77     for(int i=0; i< MAX_URL_REFERENCES; i++ ){
  78         m_RefLink4SE[i].link = NULL;
  79         m_RefLink4SE[i].anchor_text = NULL;
  80         m_RefLink4SE[i].strCharset = "";
  81 
  82         if(i < MAX_URL_REFERENCES/2){
  83             m_RefLink4History[i].link = NULL;
  84         }
  85     }
  86 
  87     //将构造函数传入的参数赋值给成员变量
  88     m_sUrl = strUrl;//网页对应的URL
  89     m_sLocation = strLocation;//网页重定向的URL,没有重定向则传入为空,否则传入重定向的URL信息
  90     m_sHeader = header;//网页的头信息
  91     m_nLenHeader = strlen(header);//网页头信息的长度
  92 
  93     m_sContent.assign(body, nLenBody);//网页体信息,用body所指向数组的前nLenBody个字符副本替换m_sContent
  94     m_nLenContent = nLenBody;//网页体信息的长度
  95 
  96 }
  97 
  98 CPage::~CPage()
  99 {
 100 }
 101 
 102 
 103 //解析网页头信息---调用8个私有的成员函数
 104 void CPage::ParseHeaderInfo(string strHeader)
 105 {
 106     GetStatusCode(strHeader);
 107     GetContentLength(strHeader);
 108     GetLocation(strHeader);
 109     GetConnectionState(strHeader);
 110 
 111     GetCharset(strHeader);
 112 
 113     GetContentEncoding(strHeader);
 114     GetContentType(strHeader);
 115     GetTransferEncoding(strHeader);
 116 }
 117 
 118 //得到状态码
 119 void CPage::GetStatusCode(string headerBuf)
 120 {
 121     //例如:
 122 
 123     //HTTP/1.0 200 OK     200就是状态码
 124     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
 125 
 126     char *charIndex = strstr(headerBuf.c_str(), "http/");//在字符串headerBuf中查找第一出现"http/"的位置
 127     if (charIndex == NULL)
 128     {
 129         m_nStatusCode = -1;
 130         return;
 131     }
 132     //吃掉所有无关的字符
 133     while(*charIndex != ' '){
 134         charIndex++;
 135     }
 136     charIndex++;
 137     
 138     int ret = sscanf(charIndex, "%i", &m_nStatusCode);//格式化字符串输入
 139     if (ret != 1)  m_nStatusCode = -1;
 140 }
 141 
 142 
 143 
 144 //从网页头信息中提取的网页体的长度,一般不是很准
 145 void CPage::GetContentLength(string headerBuf)
 146 {
 147     //例如:
 148 
 149     //content-length: 21237     21237就是网页体的长度,这个属性值是服务器返回的,不一定正确
 150     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
 151 
 152     char *charIndex = strstr(headerBuf.c_str(), "content-length");
 153     if (charIndex == NULL) return;
 154 
 155     while(*charIndex != ' '){
 156         charIndex++;
 157     }
 158     charIndex++;
 159     
 160     int ret = sscanf(charIndex, "%i", &m_nContentLength);
 161     if (ret != 1)  m_nContentLength = -1;
 162 }
 163 
 164 
 165 //得到重定向信息
 166 void CPage::GetLocation(string headerBuf)
 167 {
 168     string::size_type pre_idx,idx;
 169     const string delims("\r\n");
 170 
 171     string strBuf =  headerBuf;
 172     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
 173 
 174     idx = headerBuf.find("location:");
 175     if (idx != string::npos)//若找到
 176     {
 177         pre_idx = idx + sizeof("location: ") -1;
 178         idx = headerBuf.find_first_of(delims, pre_idx );//查找换行符
 179         if (idx != string::npos)
 180         {
 181             //m_sLocation = headerBuf.substr(pre_idx, idx - pre_idx);
 182             m_sLocation = strBuf.substr(pre_idx, idx - pre_idx);
 183         }
 184     }
 185 }
 186 
 187 
 188 //得到网页字符集
 189 void CPage::GetCharset(string headerBuf)
 190 {
 191     string::size_type pre_idx,idx;
 192     const string delims(" \",;>");
 193 
 194     CStrFun::Str2Lower(headerBuf, headerBuf.size());
 195 
 196     idx = headerBuf.find("charset=");
 197     if( idx != string::npos) {
 198         m_sCharset = headerBuf.substr(idx + sizeof("charset=") -1);//保存从charset=开始的所有字符串
 199     }
 200 
 201     headerBuf = m_sContent;
 202     headerBuf = headerBuf.substr(0,2024) ;
 203     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
 204     idx = headerBuf.find("charset=");
 205     if (idx != string::npos)//后边有可能有多余的信息
 206     {
 207         pre_idx = idx + sizeof("charset=") -1;
 208         idx = headerBuf.find_first_of(delims, pre_idx );
 209         if(idx != string::npos){
 210             m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx);
 211         }
 212     }
 213 }
 214 
 215 
 216 //得到网页体编码
 217 void CPage::GetContentEncoding(string headerBuf)
 218 {
 219     string::size_type pre_idx,idx;
 220     const string delims("\r\n");
 221 
 222     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
 223 
 224     idx = headerBuf.find("content-encoding:");
 225     if (idx != string::npos)
 226     {
 227         pre_idx = idx + sizeof("content-encoding: ") -1;
 228         idx = headerBuf.find_first_of(delims, pre_idx );
 229         if (idx != string::npos)
 230         {
 231             m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx);
 232         }
 233     }
 234 }
 235 
 236 //得到连接状态
 237 void CPage::GetConnectionState(string headerBuf)
 238 {
 239     string::size_type pre_idx,idx;
 240     const string delims(";\r\n");
 241 
 242     CStrFun::Str2Lower( headerBuf, headerBuf.length() );
 243 
 244     idx = headerBuf.find("connection:");
 245     if (idx != string::npos)
 246     {
 247         pre_idx = idx + sizeof("connection: ") -1;
 248         idx = headerBuf.find_first_of(delims, pre_idx );
 249         if (idx != string::npos)
 250         {
 251             string str = headerBuf.substr(pre_idx, idx - pre_idx);
 252             //cout << "Connection state: " << str << endl;
 253             //if (str == "close") m_bConnectionState = false;
 254             if (str == "keep-alive") m_bConnectionState = true;
 255         }
 256     }
 257 }
 258 
 259 //得到网页体类型
 260 void CPage::GetContentType(string headerBuf)
 261 {
 262     string::size_type pre_idx,idx;
 263     const string delims(";\r\n");
 264 
 265     CStrFun::Str2Lower( headerBuf, headerBuf.size() );
 266 
 267     idx = headerBuf.find("content-type:");
 268     if (idx != string::npos)
 269     {
 270         pre_idx = idx + sizeof("content-type: ") -1;
 271         idx = headerBuf.find_first_of(delims, pre_idx );
 272         if (idx != string::npos)
 273         {
 274             m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx);
 275         }
 276     }
 277 }
 278 
 279 //得到网页体的传输编码方式
 280 void CPage::GetTransferEncoding(string headerBuf)
 281 {
 282     string::size_type pre_idx,idx;
 283     const string delims(";\r\n");
 284 
 285     CStrFun::Str2Lower( headerBuf, headerBuf.size() );
 286 
 287     idx = headerBuf.find("transfer-encoding:");
 288     if ( idx != string::npos)
 289     {
 290         pre_idx = idx + sizeof("transfer-encoding: ") -1;
 291         idx = headerBuf.find_first_of(delims, pre_idx );
 292         if(idx != string::npos)
 293         {
 294             m_sTransferEncoding = headerBuf.substr(pre_idx, idx - pre_idx);
 295         }
 296     }
 297 }
 298 
 299 /*
 300  * Filter spam links
 301  * If it is, return ture; otherwise false
 302  */
 303 //判断一个URL是不是应该过滤,要过滤返回true否则返回false
 304 bool CPage::IsFilterLink(string plink)
 305 {
 306     if( plink.empty() ) return true;
 307     if( plink.size() > URL_LEN ) return true;
 308 
 309     string link = plink, tmp;
 310     string::size_type idx = 0;
 311 
 312     
 313     CStrFun::Str2Lower( link, link.length() );//link字符串中的字母全部变成小写
 314 
 315     // find two times following symbols, return false
 316     tmp = link;
 317     idx = tmp.find("?");//URL中出现2个'?'字符要过滤
 318     if( idx != string::npos ){
 319         tmp = tmp.substr(idx+1);
 320         idx = tmp.find("?");
 321         if( idx != string::npos ) return true;
 322     }
 323 
 324     tmp = link;//先后出现'-'和'+'字符要过滤
 325     idx = tmp.find("-");
 326     if( idx != string::npos ){
 327         tmp = tmp.substr(idx+1);
 328         idx = tmp.find("+");
 329         if( idx != string::npos ) return true;
 330     }
 331 
 332     //出现2个'&'字符要过滤
 333     tmp = link;
 334     idx = tmp.find("&");
 335     if( idx != string::npos ){
 336         tmp = tmp.substr(idx+1);
 337         idx = tmp.find("&");
 338         if( idx != string::npos ) return true;
 339     }
 340 
 341     //出现2个"//"字符要过滤
 342     tmp = link;
 343     idx = tmp.find("//");
 344     if( idx != string::npos ){
 345         tmp = tmp.substr(idx+1);
 346         idx = tmp.find("//");
 347         if( idx != string::npos ) return true;
 348     }
 349 
 350     //出现2个"http"要过滤
 351     tmp = link;
 352     idx = tmp.find("http");
 353     if( idx != string::npos ){
 354         tmp = tmp.substr(idx+1);
 355         idx = tmp.find("http");
 356         if( idx != string::npos ) return true;
 357     }
 358 
 359     //出现2个"misc"要过滤
 360     tmp = link;
 361     idx = tmp.find("misc");
 362     if( idx != string::npos ){
 363         tmp = tmp.substr(idx+1);
 364         idx = tmp.find("misc");
 365         if( idx != string::npos ) return true;
 366     }
 367 
 368     //出现2个"ipb"要过滤
 369     tmp = link;
 370     idx = tmp.find("ipb");
 371     if( idx != string::npos ){
 372         tmp = tmp.substr(idx+1);
 373         idx = tmp.find("ipb");
 374         if( idx != string::npos ) return true;
 375     }
 376 
 377     const char *filter_str[]={
 378     "cgi-bin",    "htbin",    "linder",    "srs5",        "uin-cgi",  // robots.txt of http://www.expasy.org/
 379     "uhtbin",    "snapshot",    "=+",        "=-",        "script",
 380     "gate",        "search",    "clickfile",    "data/scop",    "names",
 381     "staff/",    "enter",    "user",        "mail",    "pst?",
 382     "find?",    "ccc?",        "fwd?",        "tcon?",    "&amp",
 383     "counter?",    "forum",    "cgisirsi",    "{",        "}",
 384     "proxy",    "login",    "00.pl?",    "sciserv.pl",    "sign.asp",
 385     "<",        ">",        "review.asp?",    "result.asp?",    "keyword",
 386     "\"",        "'",        "php?s=",    "error",    "showdate",
 387     "niceprot.pl?",    "volue.asp?id",    ".css",        ".asp?month",    "prot.pl?",
 388     "msg.asp",    "register.asp", "database",    "reg.asp",    "qry?u",
 389     "p?msg",    "tj_all.asp?page", ".plot.",    "comment.php",    "nicezyme.pl?",
 390     "entr",        "compute-map?", "view-pdb?",    "list.cgi?",    "lists.cgi?",
 391     "details.pl?",    "aligner?",    "raw.pl?",    "interface.pl?","memcp.php?",
 392     "member.php?",    "post.php?",    "thread.php",    "bbs/",        "/bbs"
 393     };
 394     int filter_str_num = 75;
 395 
 396     //说明找到了上述字符串要过滤
 397     for(int i=0; i<filter_str_num; i++){
 398         if( link.find(filter_str[i]) != string::npos)
 399         return true;
 400     }    
 401 
 402     return false;
 403 }
 404 
/////////////////////////////
// Alternative IsFilterLink, used only for ImgSE (the image search engine).
// Example start pages:
//     http://www.people.com.cn/GB/tupian/index.html
//     http://news.xinhuanet.com/photo/
//     http://photo.tom.com/
/////////////////////////////
// To use it, comment out the IsFilterLink above and uncomment the one below.
 412 
 413 /*
 414 bool CPage::IsFilterLink(string plink)
 415 {
 416     if( plink.empty() ) return true;
 417     if( plink.size() > URL_LEN ) return true;
 418 
 419     return false;
 420 
 421     string link = plink, tmp;
 422     string::size_type idx = 0;
 423 
 424     
 425     CStrFun::Str2Lower( link, link.length() );
 426 
 427     const char *filter_str[]={
 428         "tupian", "photo", "ttjstk"
 429         };
 430     int filter_str_num = 3;
 431 
 432     CStrFun::Str2Lower( link, link.length() );
 433 
 434     for(int i=0; i<filter_str_num; i++){
 435         if( link.find(filter_str[i]) != string::npos)
 436         return false;
 437     }    
 438 
 439     return true;
 440 }
 441 */
 442 
 443 
 444 /*****************************************************************
 445 ** Function name: ParseHyperLinks
 446 ** Input argv:
 447 **      --
 448 ** Output argv:
 449 **      --
 450 ** Return:
 451         true: success
 452         false: fail
 453 ** Function Description:  Parse hyperlinks from the web page
 454 ** Version: 1.0
 455 ** Be careful:
 456 *****************************************************************/
 457 bool CPage::ParseHyperLinks()
 458 {
 459     if( GetContentLinkInfo() == false ) return false;
 460 
 461     if( m_sContentLinkInfo.empty() ) return false;
 462 
 463     bool bFind4SE = false;
 464     bool bFind4History = false;
 465     if( GetLinkInfo4SE() ){
 466         if( FindRefLink4SE() ) bFind4SE = true;
 467     } 
 468 
 469     if( GetLinkInfo4History() ){
 470         if( FindRefLink4History() ) bFind4History = true;
 471     }
 472 
 473     //如果没有从网页中提取出为搜索引擎或者为历史网页存档准备的超链接则返回false
 474     if( !bFind4SE && !bFind4History ){
 475          return false;
 476     }
 477 
 478     //return   GetHref(m_sContentLinkInfo.c_str(), "href", m_listLink4SE);
 479 
 480     return true;
 481 }
 482 
 483 
 484 /*****************************************************************
 485 ** Function name: GetContentLinkInfo
 486 ** Input argv:
 487 **      --
 488 ** Output argv:
 489 **      --
 490 ** Return:
 491         true: success
 492         false: fail
 493 ** Function Description:  Parse hyperlinks from the web page
 494 ** Version: 1.0
 495 ** Be careful:
 496 *****************************************************************/
 497 
 498 //从网页体中提取出包含超链接信息的标识
 499 bool CPage::GetContentLinkInfo()
 500 {
 501     if( m_sContent.empty() ) return false;
 502     
 503     m_sContentLinkInfo = m_sContent;
 504 
 505     string& s = m_sContentLinkInfo; //引用调用
 506 
 507     // transform all separation into one space character
 508     //CStrFun::ReplaceStr(s, "\t", " ");
 509     //CStrFun::ReplaceStr(s, "\r", " ");
 510     //CStrFun::ReplaceStr(s, "\n", " ");
 511     const string delims(" \t\r\n");
 512     string::size_type idx=0, pre_idx;
 513     
 514     //找到所有的"\t\r\n"并将'\t'替换为' ' 如果是\t\t\r\n则删除一个\t
 515     while( (idx = s.find_first_of(delims, idx)) != string::npos )
 516     {
 517         pre_idx = idx;
 518         s.replace(idx,1,1,' ');
 519         idx++;
 520         
 521         while( (idx = s.find_first_of(delims, idx)) != string::npos )
 522         {
 523             if( idx-pre_idx == 1 ){
 524                 s.erase(idx, 1);
 525             } else {
 526                 break;
 527             }
 528         }
 529 
 530         idx--;
 531     }
 532 
 533     // transform all "<br>" into one space character
 534     //将s中<br>标记全部替换为空格
 535     CStrFun::ReplaceStr(s, "<br>", " ");
 536 
 537     if( s.size() < 20 ) return false;
 538 
 539     // Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags.
 540     string::size_type idxHref=0,idxArea=0,idxImg=0;
 541     string dest;
 542 
 543     do{
 544         if( s.empty() ) break;
 545 
 546         idxHref = CStrFun::FindCase(s, "href");
 547         idxArea = CStrFun::FindCase(s, "<area");
 548         idxImg = CStrFun::FindCase(s, "<img");
 549 
 550         pre_idx = idxHref > idxArea? idxArea: idxHref;
 551         pre_idx = idxImg > pre_idx? pre_idx: idxImg;
 552         if( pre_idx == string::npos) break;
 553 
 554         s = s.substr(pre_idx);
 555         idx = s.find_first_of('<',1);
 556         if( idx != string::npos ){
 557             dest = dest + s.substr(0,idx);
 558         }else{
 559             break;
 560         }
 561 
 562         s = s.substr(idx);
 563         idxHref=0; idxArea=0; idxImg=0;
 564     }while(1);
 565 
 566     s = dest;
 567 
 568     
 569     /* erase all '\' character
 570      * too avoid the following situations:
 571      *      document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>");
 572     */
 573     CStrFun::EraseStr(s, "\\");
 574 
 575     if( s.size() < 20 ) return false;
 576 
 577     return true;
 578 }
 579 
 580 /*****************************************************************
 581 ** Function name: GetLinkInfo4SE()
 582 ** Input argv:
 583 **      --  
 584 ** Output argv:
 585 **      --
 586 ** Return:
 587        true: success
 588        false: fail
 589 ** Function Description:  Get links for SE
 590 ** Version: 1.0
 591 ** Be careful:
 592 *****************************************************************/
 593 
 594 //再从m_sContentLinkInfo提取出为搜索引擎准备的超链接
 595 bool CPage::GetLinkInfo4SE()
 596 {
 597 
 598     if( m_sContentLinkInfo.empty() ) return false;
 599 
 600     m_sLinkInfo4SE = m_sContentLinkInfo;
 601     string& s = m_sLinkInfo4SE;
 602 
 603      // Keep only <area ...>,and <a href ...> tags.
 604     string::size_type idxHref=0,idxArea=0,
 605         idx,pre_idx;
 606     string dest;
 607 
 608 
 609 
 610 
 611 
 612 
 613 
 614 
 615     /*
 616 
 617     例如:上面的m_sContentLinkInfo=href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk">
 618 
 619     我们这里提取出href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ    过滤掉<img src="http://www.google.com.hk">
 620 
 621     因为<img src="http://www.google.com.hk">的超链接是为历史网页存档准备的超链接
 622 
 623     */
 624 
 625 
 626     do{
 627         if( s.empty() ) break;
 628 
 629         //idxHref = CStrFun::FindCase(s, "<a href");
 630         idxHref = CStrFun::FindCase(s, "href");
 631         idxArea = CStrFun::FindCase(s, "<area ");
 632 
 633         pre_idx = idxHref > idxArea? idxArea: idxHref;
 634         //pre_idx = idxHref;
 635         if( pre_idx == string::npos) break;//终止条件
 636 
 637         s = s.substr(pre_idx);
 638         idx = s.find_first_of('<',1);
 639 
 640         if( !(s.length() < 4) )
 641         {
 642             idxHref = CStrFun::FindCaseFrom(s, "href", 4);
 643             idx = idx > idxHref ? idxHref: idx;
 644         }
 645 
 646         if( idx != string::npos ){
 647             dest = dest + s.substr(0,idx);
 648         }else if (idx == string::npos && pre_idx != string::npos){
 649             dest = dest + s;
 650             break;
 651         }else{
 652             break;
 653         }
 654 
 655         s = s.substr(idx);
 656         idxHref=0; idxArea=0;
 657     }while(1);
 658         
 659     s = dest;//dest保存着过滤后的数据
 660     if( s.length() < 20 ) return false;
 661 
 662 
 663     // erase all '"' , '\'', "&nbsp;".
 664     CStrFun::EraseStr(s, "\"");
 665     CStrFun::EraseStr(s, "'");
 666     CStrFun::EraseStr(s, "&nbsp");
 667 
 668      // Keep URLs and anchor text.
 669 
 670     idxHref=0;
 671     const string delims( " #>");
 672     dest.clear();
 673 
 674 
 675 
 676     /*
 677 
 678     通过上面的提取我们得到href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ
 679 
 680     我们再次提取
 681 
 682     m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ
 683 
 684     */
 685 
 686 
 687 
 688     do{
 689         if( s.empty() ) break;
 690         idxHref = CStrFun::FindCase(s, "href");
 691 
 692         if( idxHref == string::npos) break;
 693         pre_idx = idxHref;
 694 
 695         //####
 696         idx = s.find('=', idxHref);
 697         if( idx == string::npos ) break;
 698         s = s.substr(idx+1);
 699 
 700         while( s.length() > 0 && s[0] == ' ' ){
 701             s.erase(0,1);
 702         }
 703         if( s.length() == 0 ) break;
 704 
 705         idx = s.find_first_of(delims,1);
 706         //cout << endl << s.substr(0, idx) << endl;
 707         if( idx == string::npos ) break;
 708 
 709         dest += '"' + s.substr(0, idx);
 710 
 711         //cout << endl << dest << endl;
 712             
 713         idx = s.find('>');
 714         if( idx == string::npos ) break;
 715         dest += '>';
 716         s = s.substr(idx +1);
 717             
 718         idx = s.find('<');
 719 
 720         if( !s.empty() ){
 721             idxHref = CStrFun::FindCase(s, "href");
 722             idx = idx > idxHref ? idxHref: idx;
 723         }    
 724 
 725         if( idx == string::npos ){
 726             dest += s;
 727             break;
 728         }
 729 
 730 /*
 731         if( idx == idxHref ){
 732             dest += '"' + s.substr(0,idx);
 733         }else{
 734 */
 735             dest += s.substr(0,idx);
 736         //}
 737         //####
 738 
 739         idxHref=0;
 740     }while(1);
 741         
 742     // look for empty filenames.
 743     idx = 0;
 744     while( (idx = dest.find("\"\"",idx)) != string::npos ){
 745         dest.erase(idx, 1);
 746     }
 747 
 748     s = dest;
 749 
 750     return( s.length() < 20 ? false: true );
 751 
 752 }
 753                     
 754 /*****************************************************************
 755 ** Function name: GetLinkInfo4History()
 756 ** Input argv:
 757 **      --  
 758 ** Output argv:
 759 **      --
 760 ** Return:
 761        true: success
 762        false: fail
 763 ** Function Description:  Get links for history archiving
 764 ** Version: 1.0
 765 ** Be careful:
 766 *****************************************************************/
 767 bool CPage::GetLinkInfo4History()
 768 {
 769     /*
 770 
 771     例如:上面的m_sContentLinkInfo=href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk">
 772 
 773     我们这里提取出<img src="http://www.google.com.hk">   过滤掉href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ
 774 
 775     因为href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ的超链接是为搜索引擎准备的超链接
 776 
 777     */
 778 
 779 
 780 
 781 
 782 
 783 
 784     if( m_sContentLinkInfo.empty() ) return false;
 785 
 786     m_sLinkInfo4History = m_sContentLinkInfo;
 787     string& s = this->m_sLinkInfo4History;
 788 
 789      // Keep only <img ...> tags.
 790     string::size_type idxImg=0,
 791         idx,pre_idx;
 792     string dest;
 793 
 794     do{
 795         if( s.empty() ) break;
 796         idxImg = CStrFun::FindCase(s, "<img");
 797 
 798         pre_idx = idxImg;
 799         if( pre_idx == string::npos) break;
 800 
 801         s = s.substr(pre_idx);
 802         idx = s.find_first_of('<',1);
 803 
 804         if( idx != string::npos ){
 805             dest = dest + s.substr(0,idx);
 806         }else if (idx == string::npos && pre_idx != string::npos){
 807             dest = dest + s;
 808             break;
 809         }else{
 810             break;
 811         }
 812 
 813         s = s.substr(idx);
 814         idxImg=0;
 815     }while(1);
 816         
 817     s = dest;
 818     if( s.length() < 20 ) return false;
 819 
 820     // erase all '"'. '\'',"&nbsp;".
 821     CStrFun::EraseStr(s , "\"");
 822     CStrFun::EraseStr(s , "'");
 823     CStrFun::EraseStr(s , "&nbsp");
 824 
 825      // Keep URLs and anchor text.
 826 
 827     idxImg=0;
 828     string::size_type idxSrc = 0;
 829     const string delims( " #>");
 830     dest.clear();
 831 
 832 
 833     /*
 834 
 835 通过上面的提取我们得到<img src="http://www.google.com.hk">
 836 
 837     我们再次提取
 838 
 839     m_sLinkInfo4History="http://www.google.com.hk>
 840 
 841 */
 842 
 843     do{
 844         if( s.empty() ) break;
 845         idxImg = CStrFun::FindCase(s, "img");
 846 
 847         if( idxImg == string::npos) break;
 848         pre_idx = idxImg;
 849 
 850         s = s.substr(idxImg+3);        // skip "img"
 851 
 852         //####
 853         idx = s.find('>', idxImg);
 854         if( idxImg == string::npos) break;
 855         if( s.empty() ) break;
 856         idxSrc = CStrFun::FindCase(s, "src");
 857         if( idxSrc > idxImg ) continue;
 858         s = s.substr(idxSrc);
 859 
 860         idx = s.find('=', idxImg);
 861         if( idx == string::npos ) break;
 862         s = s.substr(idx+1);
 863 
 864         while( s.length() > 0 && s[0] == ' ' ){
 865             s.erase(0,1);
 866         }
 867         if( s.length() == 0 ) break;
 868 
 869         idx = s.find_first_of(delims,1);
 870         if( idx == string::npos ) break;
 871 
 872         if( s.at(0) == '"'){
 873             dest += s.substr(0, idx);
 874         }else{
 875             dest += '"' + s.substr(0, idx);
 876         }
 877             
 878         idx = s.find('>');
 879         if( idx == string::npos ) break;
 880         dest += '>';
 881         s = s.substr(idx +1);
 882             
 883         idx = s.find('<');
 884         if( idx == string::npos ){
 885             dest += s;
 886             break;
 887         }
 888         dest += s.substr(0,idx);
 889         //####
 890 
 891         idxImg=0;
 892     }while(1);
 893         
 894 
 895     // look for empty filenames.
 896     idx = 0;
 897     while( (idx = dest.find("\"\"",idx)) != string::npos ){
 898         dest.erase(idx, 1);
 899     }
 900 
 901     s = dest;
 902 
 903     return( s.length() < 20 ? false: true );
 904 
 905 }
 906 
 907 
 908 
 909 
 910 //判断strUrl是不是正规的url
 911 bool CPage::NormalizeUrl(string& strUrl)
 912 {
 913     string::size_type idx;
 914 
 915 
 916     //URL没有htp://协议名我们这里认为strUrl不是正规的URL
 917     if( CStrFun::FindCase(strUrl, "http://") == string::npos ) return false;
 918 
 919     // convert "http://e.pku.cn" to "http://e.pku.cn/"
 920     //将http://www.baidu.com转化为http://www.baidu.com/
 921     idx = strUrl.rfind('/');
 922     if( idx < 8 ) {
 923         strUrl = strUrl + "/";
 924         return true;
 925     }
 926 
 927     //将"/./"-->"/"
 928     while( (idx=strUrl.find("/./")) != string::npos ){
 929         if( idx != string::npos ) strUrl.erase(idx,2);
 930     }
 931 
 932     //将"xxx/x/../yyy"-->xxx/yyy
 933     while( (idx = strUrl.find("/../")) != string::npos ){
 934         string strPre,strSuf;
 935 
 936         strPre = strUrl.substr(0, idx);
 937 
 938         if( strUrl.length() > idx+4 )
 939             strSuf = strUrl.substr(idx+4);
 940 
 941         idx = strPre.rfind("/");
 942         if( idx != string::npos)
 943             strPre = strPre.substr(0,idx+1);
 944         if( strPre.length() < 10 ) return false;
 945 
 946         strUrl = strPre + strSuf;
 947     }
 948 
 949     if( CStrFun::FindCase(strUrl, "http://") != 0 ) return false;
 950 
 951     return true;
 952 }
 953 
 954 
 955 
 956 
 957 
 958 
 959 
 960 /*最终得到为搜索引擎准备的超链接
 961 
 962   并将相对路径的URL和绝对路径的URL分别处理,同时,我们发现从一个网页中提取的超链接可以是相同的,这个时候
 963 
 964   我们必须去重,这个函数用map容器很好的做到了这一点
 965 
 966   还有一些URL不是正规的URL也要过滤
 967 
 968   还有一些URL是必要过滤也要过滤--通过IsFilterLink(string strUrl)实现
 969 
 970  */
 971 bool CPage::FindRefLink4SE()
 972 {
 973     if( m_sLinkInfo4SE.empty() ) return false;
 974 
 975     char *buffer = (char*)m_sLinkInfo4SE.c_str();
 976     int urlnum=0,len;
 977     char *ptr ;
 978 
 979     static char buf[URL_REFERENCE_LEN];
 980 
 981     memset(buf, 0, URL_REFERENCE_LEN);
 982     len = strlen(buffer);
 983     if( len < 8 ) return false;
 984 
 985     len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1;//len记录相对较小的值
 986     strncpy( buf, buffer, len);
 987 
 988 /*first
 989  *------>
 990  */
 991 
 992 
 993     /*
 994 
 995     例如:m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ
 996 
 997     我们这里提取为
 998 
 999     http://www.baidu.com  百度
1000 
1001     http://www.qqq.com    QQ
1002 
1003     */
1004     ptr = buf;
1005     while( ptr - buf < len  && *ptr )
1006     {
1007         while( *ptr == '"' && *ptr) ptr++;
1008         if ( !*ptr ) break;
1009         this->m_RefLink4SE[ urlnum].link = ptr;//每个网页里最多有1000个链接
1010         while( *ptr && *ptr != '>')
1011         {
1012             //在遇到'>'之前,出现了' '字符,我们必须将' '字符赋值为'\0'说明URL提取完了,因为URL不可能出现' '字符
1013             if(*ptr == ' ') *ptr = '\0';
1014             //例如: "http://www.baidu.com/" height=100 width=150>百度   出现空格说明还有其他的属性值
1015             ptr++;
1016         }
1017 
1018         if ( !*ptr ){
1019             urlnum++;
1020             break;
1021         }
1022         if ( *ptr == '>' )
1023         {
1024             *ptr++='\0';
1025             if( !*ptr )
1026             {
1027                 urlnum++;
1028                 break;
1029             }
1030             
1031             if( *ptr == '"' )
1032             {
1033                 this->m_RefLink4SE[urlnum].anchor_text = NULL;
1034             }
1035             else
1036             {
1037                 this->m_RefLink4SE[urlnum].anchor_text = ptr;
1038                 while( *ptr && *ptr != '"') ptr++;
1039                 if (!*ptr)
1040                 {
1041                     urlnum++;
1042                     break;
1043                 }
1044                 if ( *ptr == '"') *ptr='\0';
1045             }
1046 
1047         }
1048         
1049         //cout << endl << this->m_RefLink4SE[ urlnum].link << '\t';
1050         //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;
1051 
1052         ptr++;
1053         urlnum++;
1054         if ( urlnum == MAX_URL_REFERENCES) break; //达到最多的url数目
1055     }
1056     //cout << endl << this->m_RefLink4SE[ urlnum].link << endl;
1057     //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;
1058 
1059     this->m_nRefLink4SENum = urlnum;
1060 
1061 /*second
1062  *------>
1063  */
1064     //typedef map<string,string,less<string> >::value_type valType;
1065     typedef map<string,string>::value_type valType;
1066 
1067     m_mapLink4SE.clear();
1068 
1069     //string strRootUrl= m_sUrl;
1070     CUrl iUrl;
1071     if( iUrl.ParseUrlEx(m_sUrl) == false )
1072     {
1073         cout << "ParseUrlEx error in FindRefLink4SE(): " << m_sUrl << endl;
1074         return false;
1075     }
1076     
1077     for(int i=0; i<m_nRefLink4SENum; i++)
1078     {
1079 
1080         string str;
1081         string::size_type idx;
1082         const string delims(" #");
1083 
1084         str = m_RefLink4SE[i].link;
1085         idx = str.find_first_of(delims, 0 );
1086         if( idx != string::npos )//如果找到标志
1087         {
1088             str = str.substr(0, idx);//只取#前边的url
1089         }
1090         if( str.size() == 0 || str.size() > URL_LEN - 1 || str.size() < 4 ) 
1091             continue;
1092 
1093 
1094         string::size_type idx1;
1095         idx1 = CStrFun::FindCase(str, "http");
1096         if( idx1 != 0  )//str有可能是相对路径
1097         {
1098             char c1 = m_sUrl.at(m_sUrl.length()-1);
1099             char c2 = str.at(0);
1100 
1101             if( c2=='/' )//str一定是相对路径
1102             {
1103                 if( iUrl.m_nPort != 80 )//若是http
1104                 {
1105                     cout << iUrl.m_sHost << endl;
1106                     cout << str << endl;
1107                     //str = "http://" + iUrl.m_sHost + ":" + (const char*)(iUrl.m_nPort) + str;
1108                     str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str;
1109                 } 
1110                 else 
1111                 {
1112                     str = "http://" + iUrl.m_sHost + str;
1113                 }
1114             } 
1115             else if( c1!='/' && c2!='/')//若两个都不是,则加上/构成新的url
1116             {
1117                 string::size_type idx;
1118 
1119                 idx = m_sUrl.rfind('/');
1120                 if( idx != string::npos )//若不是最后
1121                 {
1122                     if( idx > 6 )
1123                     { // > strlen("http://..")
1124                         str = m_sUrl.substr(0, idx+1) + str;
1125                     } 
1126                     else 
1127                     {
1128                         str = m_sUrl + "/" + str;
1129                     }
1130 
1131                 } else {
1132 
1133                     continue;
1134                 }
1135 
1136             } 
1137             else 
1138             {
1139                 if( c1=='/' )
1140                 {
1141                     str = m_sUrl + str;
1142                 }
1143                 else 
1144                 {
1145                     str = m_sUrl + "/" + str;
1146                 }
1147             }
1148         }
1149 
1150         if( NormalizeUrl(str) == false ) continue;
1151 
1152         if( IsFilterLink(str) ) continue;
1153 
1154         //debug
1155         //cout << "reflink: " << reflink << endl;
1156 
1157         if( str == m_sUrl )//一个网页中提取的超链接是其本身,我就不要了,因为我们已经有了这个网页的URL了
1158         {
1159             continue;
1160         }
1161         else
1162         {
1163             if( m_RefLink4SE[i].anchor_text )//有URL的描述符
1164             {
1165                 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() )
1166                 {
1167                     m_mapLink4SE.insert( valType( str, m_RefLink4SE[i].anchor_text));
1168                 }
1169             }
1170             else//没有URL的描述符---这个时候描述符为'\0'
1171             {
1172                 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() )
1173                 {
1174                     m_mapLink4SE.insert( valType( str, "\0") );
1175                     cout << ".";
1176                 }
1177             }
1178         }
1179             
1180 
1181     }
1182 
1183     m_nRefLink4SENum = m_mapLink4SE.size();
1184 
1185     //cout << endl;
1186 
1187     return true;
1188 }
1189 
1190 
1191 
1192 
1193 
1194 //最终得到为历史网页存档准备的超链接
1195 
1196 //并将相对路径的URL和绝对路径的URL分别处理,同时,我们发现从一个网页中提取的超链接可以是相同的,这个时候
1197 
1198 //我们必须去重,这个函数用vector容器很好的做到了这一点
1199 
1200 //还有一些URL不是正规的URL也要过滤
1201 
1202 //还有一些URL是必要过滤也要过滤--通过IsFilterLink(string strUrl)实现
1203 bool CPage::FindRefLink4History()
1204 {
1205     if( m_sLinkInfo4History.empty() ) return false;
1206 
1207     char *buffer = (char*)m_sLinkInfo4History.c_str();
1208     int urlnum=0,len;
1209     char *ptr ;
1210 
1211     static char buf[URL_REFERENCE_LEN/2];
1212 
1213     memset(buf, 0, URL_REFERENCE_LEN/2);
1214     len = strlen(buffer);
1215     if( len < 8 ) return false;
1216 
1217     len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1;
1218     strncpy( buf, buffer, len);
1219 
1220 /*first
1221  *------>
1222  */
1223     ptr = buf;
1224     while( ptr - buf < len  && *ptr ){
1225         while( *ptr == '"' && *ptr) ptr++;
1226         if ( !*ptr ) break;
1227         this->m_RefLink4History[ urlnum].link = ptr;
1228 
1229         while( *ptr && *ptr != '>'){
1230             if( *ptr == ' ') *ptr='\0';
1231             ptr++;
1232         }
1233 
1234         if( !*ptr){
1235             urlnum++;
1236             break;
1237         }
1238         if( *ptr == '>' ){
1239             *ptr++ = 0;
1240             if( !*ptr ){
1241                 urlnum++;
1242                 break;
1243             }
1244             if( *ptr == '"' ){
1245             
1246             }else{
1247                 while( *ptr && *ptr != '"') ptr++;
1248                 if( !*ptr ){
1249                     urlnum++;
1250                     break;
1251                 }
1252                 if ( *ptr == '"' ) *ptr++='\0';
1253             }
1254         }
1255         
1256         ptr++;
1257         urlnum++;
1258         if ( urlnum == MAX_URL_REFERENCES/2) break;
1259     }
1260 
1261 
1262     this->m_nRefLink4HistoryNum = urlnum;
1263 
1264 /*second
1265  *------>
1266  */
1267     m_vecLink4History.clear();
1268     //string strRootUrl= m_sUrl;
1269         CUrl iUrl;
1270         if( iUrl.ParseUrlEx(m_sUrl) == false ){
1271         cout << "ParseUrlEx error in FindRefLink4History(): " << m_sUrl << endl;
1272         return false;
1273     }
1274 
1275     for(int i=0; i<m_nRefLink4HistoryNum; i++){
1276         string str;
1277         //string::size_type idx;
1278 
1279         str = m_RefLink4History[i].link;
1280         if( str.size()==0 || str.size() > URL_LEN - 1 
1281             || str.size() < 4 ) continue;
1282 
1283 /*
1284         char *pdest1, *pdest2;
1285         pdest1 = strstr( str.c_str(), "http" );
1286         pdest2 = strstr( str.c_str(), "HTTP" );
1287         if( pdest1==NULL && pdest2==NULL ){
1288 */
1289 
1290         string::size_type idx1;
1291         idx1 = CStrFun::FindCase(str, "http");
1292         if( idx1 != 0 ){
1293             char c1 = m_sUrl.at(m_sUrl.length()-1);
1294             char c2 = str.at(0);
1295 
1296             if( c2=='/' ){
1297                 if( iUrl.m_nPort != 80 ){
1298                     str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str;
1299                 } else {
1300                     str = "http://" + iUrl.m_sHost + str;
1301                 }
1302             } else if( c1!='/' && c2!='/'){
1303                 string::size_type idx;
1304 
1305                 idx = m_sUrl.rfind('/');
1306                 if( idx != string::npos ){
1307                     if( idx > 6 ){ // > strlen("http://..")
1308                         str = m_sUrl.substr(0, idx+1) + str;
1309                     } else {
1310                         str = m_sUrl + "/" + str;
1311                     }
1312 
1313                 } else {
1314 
1315                     continue;
1316                 }
1317 
1318             } else {
1319                 if( c1=='/' ){
1320                     str = m_sUrl + str;
1321                 } else {
1322                     str = m_sUrl + "/" + str;
1323                 }
1324             }
1325         }
1326 
1327         // due to bad link parser
1328 /*
1329 
1330         idx = reflink.find(' ');
1331         if(idx != string::npos){
1332             reflink = reflink.substr(0,idx);
1333         }
1334         idx = reflink.find('"');
1335         if(idx != string::npos){
1336             reflink = reflink.substr(0,idx);
1337         }
1338 */
1339         //#############
1340 
1341         if( NormalizeUrl(str) == false ) continue;
1342 
1343 
1344         if( IsFilterLink(str) ) continue;
1345 
1346 
1347         if( str == m_sUrl ){
1348             continue;
1349         }else{
1350             vector<string>::iterator it;
1351             it = find(m_vecLink4History.begin(), m_vecLink4History.end(),str);
1352             if( it == m_vecLink4History.end() ){
1353 
1354                 m_vecLink4History.push_back( str);
1355                 cout << ".";
1356             }
1357         }
1358             
1359 
1360     }
1361     m_nRefLink4HistoryNum = m_vecLink4History.size();
1362     //cout << endl;
1363 
1364     return true;
1365 }
原文地址:https://www.cnblogs.com/kakamilan/p/2572060.html