1 /*Page handling
2 */
3
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
#include "Url.h"
#include "Page.h"
#include "StrFun.h"
13
14
15 //带参构造函数
16 CPage::CPage()
17 {
18 //初始化成员变量
19 m_nStatusCode = 0;
20 m_nContentLength = 0;
21 m_sLocation = "";
22 m_bConnectionState = false;
23 m_sContentEncoding = "";
24 m_sContentType = "";
25 m_sCharset = "";
26 m_sTransferEncoding = "";
27
28 m_sContentLinkInfo = "";
29 m_sLinkInfo4SE = "";
30 m_sLinkInfo4History = "";
31
32 m_sContentNoTags = "";
33 m_nRefLink4SENum = 0;
34 m_nRefLink4HistoryNum = 0;
35 m_eType = PLAIN_TEXT;
36
37
38 //超链接信息以及超链接的描述信息初始化都为空
39 for(int i=0; i< MAX_URL_REFERENCES; i++ ){
40 m_RefLink4SE[i].link = NULL;
41 m_RefLink4SE[i].anchor_text = NULL;
42 m_RefLink4SE[i].strCharset = "";
43
44 if(i < MAX_URL_REFERENCES/2){
45 m_RefLink4History[i].link = NULL;
46 }
47 }
48
49 }
50
51 CPage::CPage( string strUrl, string strLocation, char* header, char* body, int nLenBody)
52 {
53 //assert( header != NULL );
54 //assert( body != NULL );
55 //assert( nLenBody > 0 );
56
57 // CPage();
58 m_nStatusCode = 0;
59 m_nContentLength = 0;
60 m_sLocation = "";
61 m_bConnectionState = false;
62 m_sContentEncoding = "";
63 m_sContentType = "";
64 m_sCharset = "";
65 m_sTransferEncoding = "";
66
67 m_sContentLinkInfo = "";
68 m_sLinkInfo4SE = "";
69 m_sLinkInfo4History = "";
70
71 m_sContentNoTags = "";
72 m_nRefLink4SENum = 0;
73 m_nRefLink4HistoryNum = 0;
74 m_eType = PLAIN_TEXT;
75
76 //超链接信息以及超链接的描述信息初始化都为空
77 for(int i=0; i< MAX_URL_REFERENCES; i++ ){
78 m_RefLink4SE[i].link = NULL;
79 m_RefLink4SE[i].anchor_text = NULL;
80 m_RefLink4SE[i].strCharset = "";
81
82 if(i < MAX_URL_REFERENCES/2){
83 m_RefLink4History[i].link = NULL;
84 }
85 }
86
87 //将构造函数传入的参数赋值给成员变量
88 m_sUrl = strUrl;//网页对应的URL
89 m_sLocation = strLocation;//网页重定向的URL,没有重定向则传入为空,否则传入重定向的URL信息
90 m_sHeader = header;//网页的头信息
91 m_nLenHeader = strlen(header);//网页头信息的长度
92
93 m_sContent.assign(body, nLenBody);//网页体信息,用body所指向数组的前nLenBody个字符副本替换m_sContent
94 m_nLenContent = nLenBody;//网页体信息的长度
95
96 }
97
98 CPage::~CPage()
99 {
100 }
101
102
103 //解析网页头信息---调用8个私有的成员函数
104 void CPage::ParseHeaderInfo(string strHeader)
105 {
106 GetStatusCode(strHeader);
107 GetContentLength(strHeader);
108 GetLocation(strHeader);
109 GetConnectionState(strHeader);
110
111 GetCharset(strHeader);
112
113 GetContentEncoding(strHeader);
114 GetContentType(strHeader);
115 GetTransferEncoding(strHeader);
116 }
117
118 //得到状态码
119 void CPage::GetStatusCode(string headerBuf)
120 {
121 //例如:
122
123 //HTTP/1.0 200 OK 200就是状态码
124 CStrFun::Str2Lower( headerBuf, headerBuf.length() );
125
126 char *charIndex = strstr(headerBuf.c_str(), "http/");//在字符串headerBuf中查找第一出现"http/"的位置
127 if (charIndex == NULL)
128 {
129 m_nStatusCode = -1;
130 return;
131 }
132 //吃掉所有无关的字符
133 while(*charIndex != ' '){
134 charIndex++;
135 }
136 charIndex++;
137
138 int ret = sscanf(charIndex, "%i", &m_nStatusCode);//格式化字符串输入
139 if (ret != 1) m_nStatusCode = -1;
140 }
141
142
143
144 //从网页头信息中提取的网页体的长度,一般不是很准
145 void CPage::GetContentLength(string headerBuf)
146 {
147 //例如:
148
149 //content-length: 21237 21237就是网页体的长度,这个属性值是服务器返回的,不一定正确
150 CStrFun::Str2Lower( headerBuf, headerBuf.length() );
151
152 char *charIndex = strstr(headerBuf.c_str(), "content-length");
153 if (charIndex == NULL) return;
154
155 while(*charIndex != ' '){
156 charIndex++;
157 }
158 charIndex++;
159
160 int ret = sscanf(charIndex, "%i", &m_nContentLength);
161 if (ret != 1) m_nContentLength = -1;
162 }
163
164
165 //得到重定向信息
166 void CPage::GetLocation(string headerBuf)
167 {
168 string::size_type pre_idx,idx;
169 const string delims("\r\n");
170
171 string strBuf = headerBuf;
172 CStrFun::Str2Lower( headerBuf, headerBuf.length() );
173
174 idx = headerBuf.find("location:");
175 if (idx != string::npos)//若找到
176 {
177 pre_idx = idx + sizeof("location: ") -1;
178 idx = headerBuf.find_first_of(delims, pre_idx );//查找换行符
179 if (idx != string::npos)
180 {
181 //m_sLocation = headerBuf.substr(pre_idx, idx - pre_idx);
182 m_sLocation = strBuf.substr(pre_idx, idx - pre_idx);
183 }
184 }
185 }
186
187
188 //得到网页字符集
189 void CPage::GetCharset(string headerBuf)
190 {
191 string::size_type pre_idx,idx;
192 const string delims(" \",;>");
193
194 CStrFun::Str2Lower(headerBuf, headerBuf.size());
195
196 idx = headerBuf.find("charset=");
197 if( idx != string::npos) {
198 m_sCharset = headerBuf.substr(idx + sizeof("charset=") -1);//保存从charset=开始的所有字符串
199 }
200
201 headerBuf = m_sContent;
202 headerBuf = headerBuf.substr(0,2024) ;
203 CStrFun::Str2Lower( headerBuf, headerBuf.length() );
204 idx = headerBuf.find("charset=");
205 if (idx != string::npos)//后边有可能有多余的信息
206 {
207 pre_idx = idx + sizeof("charset=") -1;
208 idx = headerBuf.find_first_of(delims, pre_idx );
209 if(idx != string::npos){
210 m_sCharset = headerBuf.substr(pre_idx, idx - pre_idx);
211 }
212 }
213 }
214
215
216 //得到网页体编码
217 void CPage::GetContentEncoding(string headerBuf)
218 {
219 string::size_type pre_idx,idx;
220 const string delims("\r\n");
221
222 CStrFun::Str2Lower( headerBuf, headerBuf.length() );
223
224 idx = headerBuf.find("content-encoding:");
225 if (idx != string::npos)
226 {
227 pre_idx = idx + sizeof("content-encoding: ") -1;
228 idx = headerBuf.find_first_of(delims, pre_idx );
229 if (idx != string::npos)
230 {
231 m_sContentEncoding = headerBuf.substr(pre_idx, idx - pre_idx);
232 }
233 }
234 }
235
236 //得到连接状态
237 void CPage::GetConnectionState(string headerBuf)
238 {
239 string::size_type pre_idx,idx;
240 const string delims(";\r\n");
241
242 CStrFun::Str2Lower( headerBuf, headerBuf.length() );
243
244 idx = headerBuf.find("connection:");
245 if (idx != string::npos)
246 {
247 pre_idx = idx + sizeof("connection: ") -1;
248 idx = headerBuf.find_first_of(delims, pre_idx );
249 if (idx != string::npos)
250 {
251 string str = headerBuf.substr(pre_idx, idx - pre_idx);
252 //cout << "Connection state: " << str << endl;
253 //if (str == "close") m_bConnectionState = false;
254 if (str == "keep-alive") m_bConnectionState = true;
255 }
256 }
257 }
258
259 //得到网页体类型
260 void CPage::GetContentType(string headerBuf)
261 {
262 string::size_type pre_idx,idx;
263 const string delims(";\r\n");
264
265 CStrFun::Str2Lower( headerBuf, headerBuf.size() );
266
267 idx = headerBuf.find("content-type:");
268 if (idx != string::npos)
269 {
270 pre_idx = idx + sizeof("content-type: ") -1;
271 idx = headerBuf.find_first_of(delims, pre_idx );
272 if (idx != string::npos)
273 {
274 m_sContentType = headerBuf.substr(pre_idx, idx - pre_idx);
275 }
276 }
277 }
278
279 //得到网页体的传输编码方式
280 void CPage::GetTransferEncoding(string headerBuf)
281 {
282 string::size_type pre_idx,idx;
283 const string delims(";\r\n");
284
285 CStrFun::Str2Lower( headerBuf, headerBuf.size() );
286
287 idx = headerBuf.find("transfer-encoding:");
288 if ( idx != string::npos)
289 {
290 pre_idx = idx + sizeof("transfer-encoding: ") -1;
291 idx = headerBuf.find_first_of(delims, pre_idx );
292 if(idx != string::npos)
293 {
294 m_sTransferEncoding = headerBuf.substr(pre_idx, idx - pre_idx);
295 }
296 }
297 }
298
/*
 * Filter spam links.
 * If the URL should be filtered out, return true; otherwise false.
 */
//Decide whether a URL should be discarded before crawling/indexing.
304 bool CPage::IsFilterLink(string plink)
305 {
306 if( plink.empty() ) return true;
307 if( plink.size() > URL_LEN ) return true;
308
309 string link = plink, tmp;
310 string::size_type idx = 0;
311
312
313 CStrFun::Str2Lower( link, link.length() );//link字符串中的字母全部变成小写
314
315 // find two times following symbols, return false
316 tmp = link;
317 idx = tmp.find("?");//URL中出现2个'?'字符要过滤
318 if( idx != string::npos ){
319 tmp = tmp.substr(idx+1);
320 idx = tmp.find("?");
321 if( idx != string::npos ) return true;
322 }
323
324 tmp = link;//先后出现'-'和'+'字符要过滤
325 idx = tmp.find("-");
326 if( idx != string::npos ){
327 tmp = tmp.substr(idx+1);
328 idx = tmp.find("+");
329 if( idx != string::npos ) return true;
330 }
331
332 //出现2个'&'字符要过滤
333 tmp = link;
334 idx = tmp.find("&");
335 if( idx != string::npos ){
336 tmp = tmp.substr(idx+1);
337 idx = tmp.find("&");
338 if( idx != string::npos ) return true;
339 }
340
341 //出现2个"//"字符要过滤
342 tmp = link;
343 idx = tmp.find("//");
344 if( idx != string::npos ){
345 tmp = tmp.substr(idx+1);
346 idx = tmp.find("//");
347 if( idx != string::npos ) return true;
348 }
349
350 //出现2个"http"要过滤
351 tmp = link;
352 idx = tmp.find("http");
353 if( idx != string::npos ){
354 tmp = tmp.substr(idx+1);
355 idx = tmp.find("http");
356 if( idx != string::npos ) return true;
357 }
358
359 //出现2个"misc"要过滤
360 tmp = link;
361 idx = tmp.find("misc");
362 if( idx != string::npos ){
363 tmp = tmp.substr(idx+1);
364 idx = tmp.find("misc");
365 if( idx != string::npos ) return true;
366 }
367
368 //出现2个"ipb"要过滤
369 tmp = link;
370 idx = tmp.find("ipb");
371 if( idx != string::npos ){
372 tmp = tmp.substr(idx+1);
373 idx = tmp.find("ipb");
374 if( idx != string::npos ) return true;
375 }
376
377 const char *filter_str[]={
378 "cgi-bin", "htbin", "linder", "srs5", "uin-cgi", // robots.txt of http://www.expasy.org/
379 "uhtbin", "snapshot", "=+", "=-", "script",
380 "gate", "search", "clickfile", "data/scop", "names",
381 "staff/", "enter", "user", "mail", "pst?",
382 "find?", "ccc?", "fwd?", "tcon?", "&",
383 "counter?", "forum", "cgisirsi", "{", "}",
384 "proxy", "login", "00.pl?", "sciserv.pl", "sign.asp",
385 "<", ">", "review.asp?", "result.asp?", "keyword",
386 "\"", "'", "php?s=", "error", "showdate",
387 "niceprot.pl?", "volue.asp?id", ".css", ".asp?month", "prot.pl?",
388 "msg.asp", "register.asp", "database", "reg.asp", "qry?u",
389 "p?msg", "tj_all.asp?page", ".plot.", "comment.php", "nicezyme.pl?",
390 "entr", "compute-map?", "view-pdb?", "list.cgi?", "lists.cgi?",
391 "details.pl?", "aligner?", "raw.pl?", "interface.pl?","memcp.php?",
392 "member.php?", "post.php?", "thread.php", "bbs/", "/bbs"
393 };
394 int filter_str_num = 75;
395
396 //说明找到了上述字符串要过滤
397 for(int i=0; i<filter_str_num; i++){
398 if( link.find(filter_str[i]) != string::npos)
399 return true;
400 }
401
402 return false;
403 }
404
405 /////////////////////////////
406 // just for ImgSE
407 // e.g: http://www.people.com.cn/GB/tupian/index.html
408 // http://news.xinhuanet.com/photo/
409 // http://photo.tom.com/
410 /////////////////////////////
411 // comment previous one and open this one
412
413 /*
414 bool CPage::IsFilterLink(string plink)
415 {
416 if( plink.empty() ) return true;
417 if( plink.size() > URL_LEN ) return true;
418
419 return false;
420
421 string link = plink, tmp;
422 string::size_type idx = 0;
423
424
425 CStrFun::Str2Lower( link, link.length() );
426
427 const char *filter_str[]={
428 "tupian", "photo", "ttjstk"
429 };
430 int filter_str_num = 3;
431
432 CStrFun::Str2Lower( link, link.length() );
433
434 for(int i=0; i<filter_str_num; i++){
435 if( link.find(filter_str[i]) != string::npos)
436 return false;
437 }
438
439 return true;
440 }
441 */
442
443
444 /*****************************************************************
445 ** Function name: ParseHyperLinks
446 ** Input argv:
447 ** --
448 ** Output argv:
449 ** --
450 ** Return:
451 true: success
452 false: fail
453 ** Function Description: Parse hyperlinks from the web page
454 ** Version: 1.0
455 ** Be careful:
456 *****************************************************************/
457 bool CPage::ParseHyperLinks()
458 {
459 if( GetContentLinkInfo() == false ) return false;
460
461 if( m_sContentLinkInfo.empty() ) return false;
462
463 bool bFind4SE = false;
464 bool bFind4History = false;
465 if( GetLinkInfo4SE() ){
466 if( FindRefLink4SE() ) bFind4SE = true;
467 }
468
469 if( GetLinkInfo4History() ){
470 if( FindRefLink4History() ) bFind4History = true;
471 }
472
473 //如果没有从网页中提取出为搜索引擎或者为历史网页存档准备的超链接则返回false
474 if( !bFind4SE && !bFind4History ){
475 return false;
476 }
477
478 //return GetHref(m_sContentLinkInfo.c_str(), "href", m_listLink4SE);
479
480 return true;
481 }
482
483
484 /*****************************************************************
485 ** Function name: GetContentLinkInfo
486 ** Input argv:
487 ** --
488 ** Output argv:
489 ** --
490 ** Return:
491 true: success
492 false: fail
493 ** Function Description: Parse hyperlinks from the web page
494 ** Version: 1.0
495 ** Be careful:
496 *****************************************************************/
497
498 //从网页体中提取出包含超链接信息的标识
499 bool CPage::GetContentLinkInfo()
500 {
501 if( m_sContent.empty() ) return false;
502
503 m_sContentLinkInfo = m_sContent;
504
505 string& s = m_sContentLinkInfo; //引用调用
506
507 // transform all separation into one space character
508 //CStrFun::ReplaceStr(s, "\t", " ");
509 //CStrFun::ReplaceStr(s, "\r", " ");
510 //CStrFun::ReplaceStr(s, "\n", " ");
511 const string delims(" \t\r\n");
512 string::size_type idx=0, pre_idx;
513
514 //找到所有的"\t\r\n"并将'\t'替换为' ' 如果是\t\t\r\n则删除一个\t
515 while( (idx = s.find_first_of(delims, idx)) != string::npos )
516 {
517 pre_idx = idx;
518 s.replace(idx,1,1,' ');
519 idx++;
520
521 while( (idx = s.find_first_of(delims, idx)) != string::npos )
522 {
523 if( idx-pre_idx == 1 ){
524 s.erase(idx, 1);
525 } else {
526 break;
527 }
528 }
529
530 idx--;
531 }
532
533 // transform all "<br>" into one space character
534 //将s中<br>标记全部替换为空格
535 CStrFun::ReplaceStr(s, "<br>", " ");
536
537 if( s.size() < 20 ) return false;
538
539 // Keep only <img ...>, <area ...>,<script ...> and <a href ...> tags.
540 string::size_type idxHref=0,idxArea=0,idxImg=0;
541 string dest;
542
543 do{
544 if( s.empty() ) break;
545
546 idxHref = CStrFun::FindCase(s, "href");
547 idxArea = CStrFun::FindCase(s, "<area");
548 idxImg = CStrFun::FindCase(s, "<img");
549
550 pre_idx = idxHref > idxArea? idxArea: idxHref;
551 pre_idx = idxImg > pre_idx? pre_idx: idxImg;
552 if( pre_idx == string::npos) break;
553
554 s = s.substr(pre_idx);
555 idx = s.find_first_of('<',1);
556 if( idx != string::npos ){
557 dest = dest + s.substr(0,idx);
558 }else{
559 break;
560 }
561
562 s = s.substr(idx);
563 idxHref=0; idxArea=0; idxImg=0;
564 }while(1);
565
566 s = dest;
567
568
569 /* erase all '\' character
570 * too avoid the following situations:
571 * document.write("<A href=\"/~webg/refpaper/index.html\">t2</A>");
572 */
573 CStrFun::EraseStr(s, "\\");
574
575 if( s.size() < 20 ) return false;
576
577 return true;
578 }
579
580 /*****************************************************************
581 ** Function name: GetLinkInfo4SE()
582 ** Input argv:
583 ** --
584 ** Output argv:
585 ** --
586 ** Return:
587 true: success
588 false: fail
589 ** Function Description: Get links for SE
590 ** Version: 1.0
591 ** Be careful:
592 *****************************************************************/
593
594 //再从m_sContentLinkInfo提取出为搜索引擎准备的超链接
595 bool CPage::GetLinkInfo4SE()
596 {
597
598 if( m_sContentLinkInfo.empty() ) return false;
599
600 m_sLinkInfo4SE = m_sContentLinkInfo;
601 string& s = m_sLinkInfo4SE;
602
603 // Keep only <area ...>,and <a href ...> tags.
604 string::size_type idxHref=0,idxArea=0,
605 idx,pre_idx;
606 string dest;
607
608
609
610
611
612
613
614
615 /*
616
617 例如:上面的m_sContentLinkInfo=href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk">
618
619 我们这里提取出href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ 过滤掉<img src="http://www.google.com.hk">
620
621 因为<img src="http://www.google.com.hk">的超链接是为历史网页存档准备的超链接
622
623 */
624
625
626 do{
627 if( s.empty() ) break;
628
629 //idxHref = CStrFun::FindCase(s, "<a href");
630 idxHref = CStrFun::FindCase(s, "href");
631 idxArea = CStrFun::FindCase(s, "<area ");
632
633 pre_idx = idxHref > idxArea? idxArea: idxHref;
634 //pre_idx = idxHref;
635 if( pre_idx == string::npos) break;//终止条件
636
637 s = s.substr(pre_idx);
638 idx = s.find_first_of('<',1);
639
640 if( !(s.length() < 4) )
641 {
642 idxHref = CStrFun::FindCaseFrom(s, "href", 4);
643 idx = idx > idxHref ? idxHref: idx;
644 }
645
646 if( idx != string::npos ){
647 dest = dest + s.substr(0,idx);
648 }else if (idx == string::npos && pre_idx != string::npos){
649 dest = dest + s;
650 break;
651 }else{
652 break;
653 }
654
655 s = s.substr(idx);
656 idxHref=0; idxArea=0;
657 }while(1);
658
659 s = dest;//dest保存着过滤后的数据
660 if( s.length() < 20 ) return false;
661
662
663 // erase all '"' , '\'', " ".
664 CStrFun::EraseStr(s, "\"");
665 CStrFun::EraseStr(s, "'");
666 CStrFun::EraseStr(s, " ");
667
668 // Keep URLs and anchor text.
669
670 idxHref=0;
671 const string delims( " #>");
672 dest.clear();
673
674
675
676 /*
677
678 通过上面的提取我们得到href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ
679
680 我们再次提取
681
682 m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ
683
684 */
685
686
687
688 do{
689 if( s.empty() ) break;
690 idxHref = CStrFun::FindCase(s, "href");
691
692 if( idxHref == string::npos) break;
693 pre_idx = idxHref;
694
695 //####
696 idx = s.find('=', idxHref);
697 if( idx == string::npos ) break;
698 s = s.substr(idx+1);
699
700 while( s.length() > 0 && s[0] == ' ' ){
701 s.erase(0,1);
702 }
703 if( s.length() == 0 ) break;
704
705 idx = s.find_first_of(delims,1);
706 //cout << endl << s.substr(0, idx) << endl;
707 if( idx == string::npos ) break;
708
709 dest += '"' + s.substr(0, idx);
710
711 //cout << endl << dest << endl;
712
713 idx = s.find('>');
714 if( idx == string::npos ) break;
715 dest += '>';
716 s = s.substr(idx +1);
717
718 idx = s.find('<');
719
720 if( !s.empty() ){
721 idxHref = CStrFun::FindCase(s, "href");
722 idx = idx > idxHref ? idxHref: idx;
723 }
724
725 if( idx == string::npos ){
726 dest += s;
727 break;
728 }
729
730 /*
731 if( idx == idxHref ){
732 dest += '"' + s.substr(0,idx);
733 }else{
734 */
735 dest += s.substr(0,idx);
736 //}
737 //####
738
739 idxHref=0;
740 }while(1);
741
742 // look for empty filenames.
743 idx = 0;
744 while( (idx = dest.find("\"\"",idx)) != string::npos ){
745 dest.erase(idx, 1);
746 }
747
748 s = dest;
749
750 return( s.length() < 20 ? false: true );
751
752 }
753
754 /*****************************************************************
755 ** Function name: GetLinkInfo4History()
756 ** Input argv:
757 ** --
758 ** Output argv:
759 ** --
760 ** Return:
761 true: success
762 false: fail
763 ** Function Description: Get links for history archiving
764 ** Version: 1.0
765 ** Be careful:
766 *****************************************************************/
767 bool CPage::GetLinkInfo4History()
768 {
769 /*
770
771 例如:上面的m_sContentLinkInfo=href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ<img src="http://www.google.com.hk">
772
773 我们这里提取出<img src="http://www.google.com.hk"> 过滤掉href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ
774
775 因为href="http://www.baidu.com/">百度href="http://www.qq.com/">QQ的超链接是为搜索引擎准备的超链接
776
777 */
778
779
780
781
782
783
784 if( m_sContentLinkInfo.empty() ) return false;
785
786 m_sLinkInfo4History = m_sContentLinkInfo;
787 string& s = this->m_sLinkInfo4History;
788
789 // Keep only <img ...> tags.
790 string::size_type idxImg=0,
791 idx,pre_idx;
792 string dest;
793
794 do{
795 if( s.empty() ) break;
796 idxImg = CStrFun::FindCase(s, "<img");
797
798 pre_idx = idxImg;
799 if( pre_idx == string::npos) break;
800
801 s = s.substr(pre_idx);
802 idx = s.find_first_of('<',1);
803
804 if( idx != string::npos ){
805 dest = dest + s.substr(0,idx);
806 }else if (idx == string::npos && pre_idx != string::npos){
807 dest = dest + s;
808 break;
809 }else{
810 break;
811 }
812
813 s = s.substr(idx);
814 idxImg=0;
815 }while(1);
816
817 s = dest;
818 if( s.length() < 20 ) return false;
819
820 // erase all '"'. '\''," ".
821 CStrFun::EraseStr(s , "\"");
822 CStrFun::EraseStr(s , "'");
823 CStrFun::EraseStr(s , " ");
824
825 // Keep URLs and anchor text.
826
827 idxImg=0;
828 string::size_type idxSrc = 0;
829 const string delims( " #>");
830 dest.clear();
831
832
833 /*
834
835 通过上面的提取我们得到<img src="http://www.google.com.hk">
836
837 我们再次提取
838
839 m_sLinkInfo4History="http://www.google.com.hk>
840
841 */
842
843 do{
844 if( s.empty() ) break;
845 idxImg = CStrFun::FindCase(s, "img");
846
847 if( idxImg == string::npos) break;
848 pre_idx = idxImg;
849
850 s = s.substr(idxImg+3); // skip "img"
851
852 //####
853 idx = s.find('>', idxImg);
854 if( idxImg == string::npos) break;
855 if( s.empty() ) break;
856 idxSrc = CStrFun::FindCase(s, "src");
857 if( idxSrc > idxImg ) continue;
858 s = s.substr(idxSrc);
859
860 idx = s.find('=', idxImg);
861 if( idx == string::npos ) break;
862 s = s.substr(idx+1);
863
864 while( s.length() > 0 && s[0] == ' ' ){
865 s.erase(0,1);
866 }
867 if( s.length() == 0 ) break;
868
869 idx = s.find_first_of(delims,1);
870 if( idx == string::npos ) break;
871
872 if( s.at(0) == '"'){
873 dest += s.substr(0, idx);
874 }else{
875 dest += '"' + s.substr(0, idx);
876 }
877
878 idx = s.find('>');
879 if( idx == string::npos ) break;
880 dest += '>';
881 s = s.substr(idx +1);
882
883 idx = s.find('<');
884 if( idx == string::npos ){
885 dest += s;
886 break;
887 }
888 dest += s.substr(0,idx);
889 //####
890
891 idxImg=0;
892 }while(1);
893
894
895 // look for empty filenames.
896 idx = 0;
897 while( (idx = dest.find("\"\"",idx)) != string::npos ){
898 dest.erase(idx, 1);
899 }
900
901 s = dest;
902
903 return( s.length() < 20 ? false: true );
904
905 }
906
907
908
909
910 //判断strUrl是不是正规的url
911 bool CPage::NormalizeUrl(string& strUrl)
912 {
913 string::size_type idx;
914
915
916 //URL没有htp://协议名我们这里认为strUrl不是正规的URL
917 if( CStrFun::FindCase(strUrl, "http://") == string::npos ) return false;
918
919 // convert "http://e.pku.cn" to "http://e.pku.cn/"
920 //将http://www.baidu.com转化为http://www.baidu.com/
921 idx = strUrl.rfind('/');
922 if( idx < 8 ) {
923 strUrl = strUrl + "/";
924 return true;
925 }
926
927 //将"/./"-->"/"
928 while( (idx=strUrl.find("/./")) != string::npos ){
929 if( idx != string::npos ) strUrl.erase(idx,2);
930 }
931
932 //将"xxx/x/../yyy"-->xxx/yyy
933 while( (idx = strUrl.find("/../")) != string::npos ){
934 string strPre,strSuf;
935
936 strPre = strUrl.substr(0, idx);
937
938 if( strUrl.length() > idx+4 )
939 strSuf = strUrl.substr(idx+4);
940
941 idx = strPre.rfind("/");
942 if( idx != string::npos)
943 strPre = strPre.substr(0,idx+1);
944 if( strPre.length() < 10 ) return false;
945
946 strUrl = strPre + strSuf;
947 }
948
949 if( CStrFun::FindCase(strUrl, "http://") != 0 ) return false;
950
951 return true;
952 }
953
954
955
956
957
958
959
960 /*最终得到为搜索引擎准备的超链接
961
962 并将相对路径的URL和绝对路径的URL分别处理,同时,我们发现从一个网页中提取的超链接可以是相同的,这个时候
963
964 我们必须去重,这个函数用map容器很好的做到了这一点
965
966 还有一些URL不是正规的URL也要过滤
967
968 还有一些URL是必要过滤也要过滤--通过IsFilterLink(string strUrl)实现
969
970 */
971 bool CPage::FindRefLink4SE()
972 {
973 if( m_sLinkInfo4SE.empty() ) return false;
974
975 char *buffer = (char*)m_sLinkInfo4SE.c_str();
976 int urlnum=0,len;
977 char *ptr ;
978
979 static char buf[URL_REFERENCE_LEN];
980
981 memset(buf, 0, URL_REFERENCE_LEN);
982 len = strlen(buffer);
983 if( len < 8 ) return false;
984
985 len = len < URL_REFERENCE_LEN -1 ? len : URL_REFERENCE_LEN - 1;//len记录相对较小的值
986 strncpy( buf, buffer, len);
987
988 /*first
989 *------>
990 */
991
992
993 /*
994
995 例如:m_sLinkInfo4SE="http://www.baidu.com/">百度"http://www.qq.com/">QQ
996
997 我们这里提取为
998
999 http://www.baidu.com 百度
1000
1001 http://www.qqq.com QQ
1002
1003 */
1004 ptr = buf;
1005 while( ptr - buf < len && *ptr )
1006 {
1007 while( *ptr == '"' && *ptr) ptr++;
1008 if ( !*ptr ) break;
1009 this->m_RefLink4SE[ urlnum].link = ptr;//每个网页里最多有1000个链接
1010 while( *ptr && *ptr != '>')
1011 {
1012 //在遇到'>'之前,出现了' '字符,我们必须将' '字符赋值为'\0'说明URL提取完了,因为URL不可能出现' '字符
1013 if(*ptr == ' ') *ptr = '\0';
1014 //例如: "http://www.baidu.com/" height=100 width=150>百度 出现空格说明还有其他的属性值
1015 ptr++;
1016 }
1017
1018 if ( !*ptr ){
1019 urlnum++;
1020 break;
1021 }
1022 if ( *ptr == '>' )
1023 {
1024 *ptr++='\0';
1025 if( !*ptr )
1026 {
1027 urlnum++;
1028 break;
1029 }
1030
1031 if( *ptr == '"' )
1032 {
1033 this->m_RefLink4SE[urlnum].anchor_text = NULL;
1034 }
1035 else
1036 {
1037 this->m_RefLink4SE[urlnum].anchor_text = ptr;
1038 while( *ptr && *ptr != '"') ptr++;
1039 if (!*ptr)
1040 {
1041 urlnum++;
1042 break;
1043 }
1044 if ( *ptr == '"') *ptr='\0';
1045 }
1046
1047 }
1048
1049 //cout << endl << this->m_RefLink4SE[ urlnum].link << '\t';
1050 //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;
1051
1052 ptr++;
1053 urlnum++;
1054 if ( urlnum == MAX_URL_REFERENCES) break; //达到最多的url数目
1055 }
1056 //cout << endl << this->m_RefLink4SE[ urlnum].link << endl;
1057 //cout << this->m_RefLink4SE[ urlnum].anchor_text << endl;
1058
1059 this->m_nRefLink4SENum = urlnum;
1060
1061 /*second
1062 *------>
1063 */
1064 //typedef map<string,string,less<string> >::value_type valType;
1065 typedef map<string,string>::value_type valType;
1066
1067 m_mapLink4SE.clear();
1068
1069 //string strRootUrl= m_sUrl;
1070 CUrl iUrl;
1071 if( iUrl.ParseUrlEx(m_sUrl) == false )
1072 {
1073 cout << "ParseUrlEx error in FindRefLink4SE(): " << m_sUrl << endl;
1074 return false;
1075 }
1076
1077 for(int i=0; i<m_nRefLink4SENum; i++)
1078 {
1079
1080 string str;
1081 string::size_type idx;
1082 const string delims(" #");
1083
1084 str = m_RefLink4SE[i].link;
1085 idx = str.find_first_of(delims, 0 );
1086 if( idx != string::npos )//如果找到标志
1087 {
1088 str = str.substr(0, idx);//只取#前边的url
1089 }
1090 if( str.size() == 0 || str.size() > URL_LEN - 1 || str.size() < 4 )
1091 continue;
1092
1093
1094 string::size_type idx1;
1095 idx1 = CStrFun::FindCase(str, "http");
1096 if( idx1 != 0 )//str有可能是相对路径
1097 {
1098 char c1 = m_sUrl.at(m_sUrl.length()-1);
1099 char c2 = str.at(0);
1100
1101 if( c2=='/' )//str一定是相对路径
1102 {
1103 if( iUrl.m_nPort != 80 )//若是http
1104 {
1105 cout << iUrl.m_sHost << endl;
1106 cout << str << endl;
1107 //str = "http://" + iUrl.m_sHost + ":" + (const char*)(iUrl.m_nPort) + str;
1108 str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str;
1109 }
1110 else
1111 {
1112 str = "http://" + iUrl.m_sHost + str;
1113 }
1114 }
1115 else if( c1!='/' && c2!='/')//若两个都不是,则加上/构成新的url
1116 {
1117 string::size_type idx;
1118
1119 idx = m_sUrl.rfind('/');
1120 if( idx != string::npos )//若不是最后
1121 {
1122 if( idx > 6 )
1123 { // > strlen("http://..")
1124 str = m_sUrl.substr(0, idx+1) + str;
1125 }
1126 else
1127 {
1128 str = m_sUrl + "/" + str;
1129 }
1130
1131 } else {
1132
1133 continue;
1134 }
1135
1136 }
1137 else
1138 {
1139 if( c1=='/' )
1140 {
1141 str = m_sUrl + str;
1142 }
1143 else
1144 {
1145 str = m_sUrl + "/" + str;
1146 }
1147 }
1148 }
1149
1150 if( NormalizeUrl(str) == false ) continue;
1151
1152 if( IsFilterLink(str) ) continue;
1153
1154 //debug
1155 //cout << "reflink: " << reflink << endl;
1156
1157 if( str == m_sUrl )//一个网页中提取的超链接是其本身,我就不要了,因为我们已经有了这个网页的URL了
1158 {
1159 continue;
1160 }
1161 else
1162 {
1163 if( m_RefLink4SE[i].anchor_text )//有URL的描述符
1164 {
1165 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() )
1166 {
1167 m_mapLink4SE.insert( valType( str, m_RefLink4SE[i].anchor_text));
1168 }
1169 }
1170 else//没有URL的描述符---这个时候描述符为'\0'
1171 {
1172 if( m_mapLink4SE.find(str) == m_mapLink4SE.end() )
1173 {
1174 m_mapLink4SE.insert( valType( str, "\0") );
1175 cout << ".";
1176 }
1177 }
1178 }
1179
1180
1181 }
1182
1183 m_nRefLink4SENum = m_mapLink4SE.size();
1184
1185 //cout << endl;
1186
1187 return true;
1188 }
1189
1190
1191
1192
1193
1194 //最终得到为历史网页存档准备的超链接
1195
1196 //并将相对路径的URL和绝对路径的URL分别处理,同时,我们发现从一个网页中提取的超链接可以是相同的,这个时候
1197
1198 //我们必须去重,这个函数用vector容器很好的做到了这一点
1199
1200 //还有一些URL不是正规的URL也要过滤
1201
1202 //还有一些URL是必要过滤也要过滤--通过IsFilterLink(string strUrl)实现
1203 bool CPage::FindRefLink4History()
1204 {
1205 if( m_sLinkInfo4History.empty() ) return false;
1206
1207 char *buffer = (char*)m_sLinkInfo4History.c_str();
1208 int urlnum=0,len;
1209 char *ptr ;
1210
1211 static char buf[URL_REFERENCE_LEN/2];
1212
1213 memset(buf, 0, URL_REFERENCE_LEN/2);
1214 len = strlen(buffer);
1215 if( len < 8 ) return false;
1216
1217 len = len < URL_REFERENCE_LEN/2 - 1? len : URL_REFERENCE_LEN/2 -1;
1218 strncpy( buf, buffer, len);
1219
1220 /*first
1221 *------>
1222 */
1223 ptr = buf;
1224 while( ptr - buf < len && *ptr ){
1225 while( *ptr == '"' && *ptr) ptr++;
1226 if ( !*ptr ) break;
1227 this->m_RefLink4History[ urlnum].link = ptr;
1228
1229 while( *ptr && *ptr != '>'){
1230 if( *ptr == ' ') *ptr='\0';
1231 ptr++;
1232 }
1233
1234 if( !*ptr){
1235 urlnum++;
1236 break;
1237 }
1238 if( *ptr == '>' ){
1239 *ptr++ = 0;
1240 if( !*ptr ){
1241 urlnum++;
1242 break;
1243 }
1244 if( *ptr == '"' ){
1245
1246 }else{
1247 while( *ptr && *ptr != '"') ptr++;
1248 if( !*ptr ){
1249 urlnum++;
1250 break;
1251 }
1252 if ( *ptr == '"' ) *ptr++='\0';
1253 }
1254 }
1255
1256 ptr++;
1257 urlnum++;
1258 if ( urlnum == MAX_URL_REFERENCES/2) break;
1259 }
1260
1261
1262 this->m_nRefLink4HistoryNum = urlnum;
1263
1264 /*second
1265 *------>
1266 */
1267 m_vecLink4History.clear();
1268 //string strRootUrl= m_sUrl;
1269 CUrl iUrl;
1270 if( iUrl.ParseUrlEx(m_sUrl) == false ){
1271 cout << "ParseUrlEx error in FindRefLink4History(): " << m_sUrl << endl;
1272 return false;
1273 }
1274
1275 for(int i=0; i<m_nRefLink4HistoryNum; i++){
1276 string str;
1277 //string::size_type idx;
1278
1279 str = m_RefLink4History[i].link;
1280 if( str.size()==0 || str.size() > URL_LEN - 1
1281 || str.size() < 4 ) continue;
1282
1283 /*
1284 char *pdest1, *pdest2;
1285 pdest1 = strstr( str.c_str(), "http" );
1286 pdest2 = strstr( str.c_str(), "HTTP" );
1287 if( pdest1==NULL && pdest2==NULL ){
1288 */
1289
1290 string::size_type idx1;
1291 idx1 = CStrFun::FindCase(str, "http");
1292 if( idx1 != 0 ){
1293 char c1 = m_sUrl.at(m_sUrl.length()-1);
1294 char c2 = str.at(0);
1295
1296 if( c2=='/' ){
1297 if( iUrl.m_nPort != 80 ){
1298 str = "http://" + iUrl.m_sHost + ":" + CStrFun::itos(iUrl.m_nPort) + str;
1299 } else {
1300 str = "http://" + iUrl.m_sHost + str;
1301 }
1302 } else if( c1!='/' && c2!='/'){
1303 string::size_type idx;
1304
1305 idx = m_sUrl.rfind('/');
1306 if( idx != string::npos ){
1307 if( idx > 6 ){ // > strlen("http://..")
1308 str = m_sUrl.substr(0, idx+1) + str;
1309 } else {
1310 str = m_sUrl + "/" + str;
1311 }
1312
1313 } else {
1314
1315 continue;
1316 }
1317
1318 } else {
1319 if( c1=='/' ){
1320 str = m_sUrl + str;
1321 } else {
1322 str = m_sUrl + "/" + str;
1323 }
1324 }
1325 }
1326
1327 // due to bad link parser
1328 /*
1329
1330 idx = reflink.find(' ');
1331 if(idx != string::npos){
1332 reflink = reflink.substr(0,idx);
1333 }
1334 idx = reflink.find('"');
1335 if(idx != string::npos){
1336 reflink = reflink.substr(0,idx);
1337 }
1338 */
1339 //#############
1340
1341 if( NormalizeUrl(str) == false ) continue;
1342
1343
1344 if( IsFilterLink(str) ) continue;
1345
1346
1347 if( str == m_sUrl ){
1348 continue;
1349 }else{
1350 vector<string>::iterator it;
1351 it = find(m_vecLink4History.begin(), m_vecLink4History.end(),str);
1352 if( it == m_vecLink4History.end() ){
1353
1354 m_vecLink4History.push_back( str);
1355 cout << ".";
1356 }
1357 }
1358
1359
1360 }
1361 m_nRefLink4HistoryNum = m_vecLink4History.size();
1362 //cout << endl;
1363
1364 return true;
1365 }