#ifndef _HTTP_H_031105_
#define _HTTP_H_031105_

#include <map>
#include <string>

#include <pthread.h>

// NOTE(review): a using-directive in a header leaks into every includer.
// It is kept because the implementation file relies on it for unqualified
// `string`, `cout` and `endl`; remove it only together with those uses.
using namespace std;

// Only pointers to sockaddr appear below, so a forward declaration suffices
// and the socket headers stay out of this header.
struct sockaddr;

/**
 * Minimal HTTP client used to download a single page.
 *
 * The only public operation is Fetch(); the private members are the helpers
 * it is built from.
 */
class CHttp
{
private:
    string m_strUrl;    // url
    int *m_sock;        // socket

public:
    CHttp();
    virtual ~CHttp();

    /**
     * Download the page behind strUrl.
     *
     * @param strUrl    URL of the page to fetch.
     * @param fileBuf   Receives a malloc()ed, NUL-terminated copy of the page
     *                  body. Pass NULL to request the page and discard it.
     * @param fileHead  Receives a malloc()ed copy of the response header.
     * @param location  Receives a malloc()ed copy of the redirect target when
     *                  the server answers 301 or 302.
     * @param sock      In: descriptor of a connection to reuse, or -1 to open
     *                  a new one. Out: the descriptor that was used, or -1
     *                  when the connection has been closed.
     * @return Number of body bytes read on success;
     *         -1 on any other error,
     *         -2 when CreateSocket() reports -2 (address refused by the IP filter),
     *         -3 when no connection could be created,
     *         -4 when the Content-Type contains "image",
     *         -300 when the page is redirected.
     */
    int Fetch(string strUrl, char **fileBuf,
        char **fileHead, char **location, int* sock);

private:
    // The four private helpers below are used by Fetch().

    // Reads the response header from sock into headerPtr, one byte at a time,
    // waiting for data with select(). Returns the number of bytes read or -1.
    int read_header(int sock, char *headerPtr);

    // Resolves host, creates a TCP socket and connects it to host:port.
    // Returns the descriptor, -1 on error, -2 when the IP filter refuses the address.
    int CreateSocket(const char *host, int port);

    // Called by CreateSocket(): connects the socket in non-blocking mode and
    // waits with select() for at most the given number of seconds.
    int nonb_connect(int, struct sockaddr*, int);

    // Makes sure *buf has more than `more` bytes free behind its current
    // string; otherwise grows the buffer by more+1 bytes.
    // Returns 0 on success, -1 when the reallocation fails.
    int checkBufSize(char **buf, int *bufsize, int more);

};

extern pthread_mutex_t mutexMemory;

#endif /* _HTTP_H_031105_ */
1 #include <stdlib.h> 2 #include <stdio.h> 3 #include <string.h> 4 #include <strings.h> 5 #include <errno.h> 6 #include <netdb.h> 7 #include <unistd.h> 8 #include <netinet/in.h> 9 #include <sys/types.h> 10 #include <sys/socket.h> 11 #include <sys/time.h> 12 #include <fcntl.h> 13 #include <iostream> 14 #include "Http.h" 15 16 //#include "Tse.h" 17 #include "CommonDef.h" 18 #include "Url.h" 19 //#include "Page.h" 20 #include "StrFun.h" 21 22 char *userAgent = NULL; 23 int timeout = DEFAULT_TIMEOUT;//设置最长的等待时间30秒 24 int hideUserAgent = 0; 25 26 CHttp::CHttp() 27 { 28 } 29 30 CHttp::~CHttp() 31 { 32 } 33 34 35 /* 36 * Actually downloads the page, registering a hit (donation) 37 * If the fileBuf passed in is NULL, the url is downloaded and then 38 * freed; otherwise the necessary space is allocated for fileBuf. 39 * Returns size of download on success, 40 -1 on error is set, 41 -2 out of ip block, 42 -3 invalid host, 43 -4 MIME is imag/xxx 44 -300 on 301. 45 */ 46 47 48 49 /* 50 51 function: 52 53 success: return bytesRead[网页体信息的真实的字节数] 54 55 fail: return -1 各种其他的错误 56 57 return -2 在IP阻塞范围内 58 59 return -3 无效的主机号 60 61 return -4 image/text类型 62 63 return -300 网页重定向 64 65 strUrl: 待抓取的网页对应的URL 66 67 fileBuf: 网页体信息 68 69 fileHead:网页头信息 70 71 location:网页如果重定向对应的URL 72 73 sock:套接子文件描述符 74 75 */ 76 int CHttp::Fetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock ) 77 { 78 char *tmp, *url, *requestBuf, *pageBuf; 79 const char *host, *path; 80 int sock, bytesRead = 0, bufsize = REQUEST_BUF_SIZE; 81 int ret = -1, tempSize, selectRet; 82 int port = 80; 83 84 85 if( strUrl.empty() )//空的URL肯定不能抓取到网页 86 { 87 cout << "strUrl is NULL" << endl; 88 return -1; 89 } 90 91 /* Copy the url passed in into a buffer we can work with, change, etc. 
*/ 92 /* 93 url = (char*)malloc(strUrl.length()+1); 94 if( url == NULL ){ 95 cout << "can not allocate enought memory for url" << endl; 96 return -1; 97 } else { 98 memset(url, 0,strUrl.length()+1); 99 memcpy(url, strUrl.c_str(), strUrl.length() ); 100 } 101 */ 102 //pthread_mutex_lock(&mutexMemory); 103 url = strdup(strUrl.c_str());//复制url 104 //pthread_mutex_unlock(&mutexMemory); 105 if( url == NULL )//分配失败 106 { 107 cout << "!error: stdup() in Fetch()" << endl; 108 return -1; 109 } 110 111 // parse the url 112 CUrl u; 113 if( u.ParseUrlEx(url) == false ) 114 { 115 //如果没有"http://"协议号,肯定会解析错误 116 cout << "ParseUrlEx error in Fetch(): " << strUrl << endl; 117 return -1; 118 } 119 120 host = u.m_sHost.c_str(); 121 path = u.m_sPath.c_str(); 122 if( u.m_nPort > 0 ) port = u.m_nPort; 123 124 /* Compose a request string */ 125 //pthread_mutex_lock(&mutexMemory); 126 127 /*构造HTTP请求报文: 假设strUrl="http://www.baidu.com/ecjtu/nihao.html"*/ 128 // GET /ecjtu/nihao.html HTTP/1.0\r\n 129 requestBuf = (char*)malloc(bufsize); 130 //pthread_mutex_unlock(&mutexMemory); 131 if(requestBuf == NULL) 132 { 133 if (url) 134 { 135 //pthread_mutex_lock(&mutexMemory); 136 free(url); 137 url=NULL; 138 //pthread_mutex_unlock(&mutexMemory); 139 } 140 cout << "can not allocate enought memory for requestBuf" << endl; 141 return -1; 142 } 143 requestBuf[0] = 0; 144 145 if( strlen(path) < 1 )//说明请求的是根目录下的网页 146 { 147 // GET / HTTP/1.0\r\n 148 /* The url has no '/' in it, assume the user is making a root-level 149 * request */ 150 tempSize = strlen("GET /") + strlen(HTTP_VERSION) +2; 151 /* 152 if( tempSize > bufsize ){ 153 free(url); 154 free(requestBuf); 155 cout << "tempSize larger than bufsize" << endl; 156 return -1; 157 } 158 */ 159 160 if(checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET / %s\r\n", HTTP_VERSION) < 0 ){ 161 /*int snprintf(char *restrict buf, size_t n, const char * restrict format, ...); 162 函数说明:最多从源串中拷贝n-1个字符到目标串中,然后再在后面加一个0。所以如果目标串的大小为n 163 
的话,将不会溢出。*/ 164 165 //pthread_mutex_lock(&mutexMemory); 166 if (url) 167 { 168 free(url); 169 url=NULL; 170 } 171 if (requestBuf) 172 { 173 free(requestBuf); 174 requestBuf=NULL; 175 } 176 //pthread_mutex_unlock(&mutexMemory); 177 cout << "1.checkBuffSize(&requestBuf..) error" << endl; 178 return -1; 179 } 180 181 } 182 else//说明请求的是非根目录下的网页 183 { 184 tempSize = strlen("GET ") + strlen(path) + strlen(HTTP_VERSION) + 4; 185 186 if(checkBufSize(&requestBuf, &bufsize, tempSize) || snprintf(requestBuf, bufsize, "GET %s %s\r\n", path, HTTP_VERSION) < 0) 187 { 188 189 //pthread_mutex_lock(&mutexMemory); 190 if (url) 191 { 192 free(url); 193 url=NULL; 194 } 195 if (requestBuf) 196 { 197 free(requestBuf); 198 requestBuf=NULL; 199 } 200 //pthread_mutex_unlock(&mutexMemory); 201 cout << "2._checkBuffSize(&requestBuf..) error" << endl; 202 return -1; 203 } 204 205 } 206 207 208 /* Use Host: even though 1.0 doesn't specify it. Some servers 209 * won't play nice if we don't send Host, and it shouldn't hurt anything */ 210 tempSize = (int)strlen("Host: ") + (int)strlen(host) + 3;/* +3 for "\r\n\0" */ 211 212 if(checkBufSize(&requestBuf, &bufsize, tempSize + 128)){ 213 //pthread_mutex_lock(&mutexMemory); 214 if (url) 215 { 216 free(url); url=NULL; 217 } 218 if (requestBuf) 219 { 220 free(requestBuf); requestBuf=NULL; 221 } 222 //pthread_mutex_unlock(&mutexMemory); 223 cout << "3._checkBuffSize(&requestBuf..) 
error" << endl; 224 return -1; 225 } 226 227 strcat(requestBuf, "Host: "); 228 strcat(requestBuf, host); 229 strcat(requestBuf, "\r\n"); 230 231 if(!hideUserAgent && userAgent == NULL) { 232 233 tempSize = (int)strlen("User-Agent: ") + 234 (int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 4; 235 if(checkBufSize(&requestBuf, &bufsize, tempSize)) { 236 //pthread_mutex_lock(&mutexMemory); 237 if (url) 238 { 239 free(url); url=NULL; 240 } 241 if (requestBuf) 242 { 243 free(requestBuf); requestBuf=NULL; 244 } 245 //pthread_mutex_unlock(&mutexMemory); 246 cout << "4._checkBuffSize(&requestBuf..) error" << endl; 247 return -1; 248 } 249 strcat(requestBuf, "User-Agent: "); 250 strcat(requestBuf, DEFAULT_USER_AGENT); 251 strcat(requestBuf, "/"); 252 strcat(requestBuf, VERSION); 253 strcat(requestBuf, "\r\n"); 254 255 } else if(!hideUserAgent) { 256 257 tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 3; 258 if(checkBufSize(&requestBuf, &bufsize, tempSize)) { 259 260 //pthread_mutex_lock(&mutexMemory); 261 if (url) 262 { 263 free(url); url=NULL; 264 } 265 if (requestBuf) 266 { 267 free(requestBuf); requestBuf=NULL; 268 } 269 //pthread_mutex_unlock(&mutexMemory); 270 cout << "5._checkBuffSize(&requestBuf..) error" << endl; 271 return -1; 272 } 273 strcat(requestBuf, "User-Agent: "); 274 strcat(requestBuf, userAgent); 275 strcat(requestBuf, "\r\n"); 276 } 277 278 //tempSize = (int)strlen("Connection: Close\n\n"); 279 tempSize = (int)strlen("Connection: Keep-Alive\r\n\r\n"); 280 if(checkBufSize(&requestBuf, &bufsize, tempSize)) { 281 //pthread_mutex_lock(&mutexMemory); 282 if (url) 283 { 284 free(url); url=NULL; 285 } 286 if (requestBuf) 287 { 288 free(requestBuf); requestBuf=NULL; 289 } 290 //pthread_mutex_unlock(&mutexMemory); 291 cout << "6._checkBuffSize(&requestBuf..) 
error" << endl; 292 return -1; 293 } 294 295 296 //strcat(requestBuf, "Connection: Close\n\n"); 297 strcat(requestBuf, "Connection: Keep-Alive\r\n\r\n"); 298 299 300 /* Now free any excess memory allocated to the buffer */ 301 //pthread_mutex_lock(&mutexMemory); 302 //重新调整requestBuf的内存空间,释放多余的内存空间 303 tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1); 304 //pthread_mutex_unlock(&mutexMemory); 305 if(tmp == NULL){ 306 //pthread_mutex_lock(&mutexMemory); 307 if (url) 308 { 309 free(url); url=NULL; 310 } 311 if (requestBuf) 312 { 313 free(requestBuf); requestBuf=NULL; 314 } 315 //pthread_mutex_unlock(&mutexMemory); 316 cout << "realloc for tmp error" << endl; 317 return -1; 318 } 319 requestBuf = tmp; 320 321 if( *nPSock != -1 ){ 322 sock = *nPSock; 323 cout << "using privous socket " << *nPSock << endl; 324 }else{ 325 326 // cout << "1.get a new one" << endl; 327 sock = CreateSocket( host, port ); 328 if(sock == -1) { // invalid host 329 //pthread_mutex_lock(&mutexMemory); 330 if (url) 331 { 332 free(url); url=NULL; 333 } 334 if (requestBuf) 335 { 336 free(requestBuf); requestBuf=NULL; 337 } 338 //pthread_mutex_unlock(&mutexMemory); 339 return -3; 340 } 341 if(sock == -2) { // out of ip block 342 //pthread_mutex_lock(&mutexMemory); 343 if (url) 344 { 345 free(url); url=NULL; 346 } 347 if (requestBuf) 348 { 349 free(requestBuf); requestBuf=NULL; 350 } 351 //pthread_mutex_unlock(&mutexMemory); 352 //cout << "2.not able to MakeSocket" << endl; 353 return -2; 354 } 355 } 356 357 358 359 ret = write(sock, requestBuf, strlen(requestBuf)); 360 if( ret == 0 ){ 361 cout << "requestBuf is " << requestBuf << endl; 362 cout << "write nothing" << endl; 363 //pthread_mutex_lock(&mutexMemory); 364 if (url) 365 { 366 free(url); url=NULL; 367 } 368 if (requestBuf) 369 { 370 free(requestBuf); requestBuf=NULL; 371 } 372 //pthread_mutex_unlock(&mutexMemory); 373 close(sock); 374 *nPSock = -1; 375 return -1; 376 377 } 378 if( ret == -1){ 379 //cout << "write error" << endl; 380 
// sock is invalid,we should make a new one 381 close(sock); 382 *nPSock = -1; 383 384 cout << "2.close previous socket " << *nPSock << " and get a new one" << endl; 385 //maybe sock is dead,try again 386 sock = CreateSocket( host, port ); 387 if(sock == -1) { 388 //pthread_mutex_lock(&mutexMemory); 389 if (url) 390 { 391 free(url); url=NULL; 392 } 393 if (requestBuf) 394 { 395 free(requestBuf); requestBuf=NULL; 396 } 397 //pthread_mutex_unlock(&mutexMemory); 398 cout << "3.not able to MakeSocket" << endl; 399 return -1; 400 } 401 if(sock == -2) { 402 //pthread_mutex_lock(&mutexMemory); 403 if (url) 404 { 405 free(url); url=NULL; 406 } 407 if (requestBuf) 408 { 409 free(requestBuf); requestBuf=NULL; 410 } 411 //pthread_mutex_unlock(&mutexMemory); 412 cout << "4.not able to MakeSocket" << endl; 413 return -1; 414 } 415 if(write(sock, requestBuf, strlen(requestBuf)) == -1){ 416 //pthread_mutex_lock(&mutexMemory); 417 if (url) 418 { 419 free(url); url=NULL; 420 } 421 if (requestBuf) 422 { 423 free(requestBuf); requestBuf=NULL; 424 } 425 //pthread_mutex_unlock(&mutexMemory); 426 close(sock); 427 *nPSock = -1; 428 cout << "write error" << endl; 429 return -1; 430 } 431 } 432 433 //pthread_mutex_lock(&mutexMemory); 434 if (url) 435 { 436 free(url); url=NULL; 437 } 438 if (requestBuf) 439 { 440 free(requestBuf); requestBuf=NULL; 441 } 442 //pthread_mutex_unlock(&mutexMemory); 443 444 445 char headerBuf[HEADER_BUF_SIZE]; 446 /* Grab enough of the response to get the metadata */ 447 memset( headerBuf,0,HEADER_BUF_SIZE ); 448 //cout << "old sock is " << sock << endl; 449 ret = read_header(sock, headerBuf); 450 //cout << "ret = " << ret << endl; 451 if(ret < 0) { 452 close(sock); 453 *nPSock = -1; 454 return -1; 455 } 456 457 //cout << headerBuf << endl; 458 if( strlen(headerBuf) == 0 ){ 459 cout << "strlen(headerBuf) = 0" << headerBuf << endl; 460 cout << "strUrl: " << strUrl << endl << endl;; 461 close(sock); 462 *nPSock = -1; 463 return -1; 464 } 465 466 467 468 //解析网页头信息 
469 CPage iPage; 470 iPage.ParseHeaderInfo(headerBuf); 471 if (iPage.m_nStatusCode == -1) 472 { 473 close(sock); 474 *nPSock = -1; 475 cout << "headerBuf: " << headerBuf << endl; 476 cout << "!header error: not find HTTP" << endl; 477 return -1; 478 } 479 480 481 482 // deal with http://net.cs.pku.edu.cn/~cnds 483 if (iPage.m_nStatusCode == 301 || iPage.m_nStatusCode == 302) 484 { 485 if (iPage.m_sLocation.empty() || iPage.m_sLocation.size()>URL_LEN) 486 { 487 close(sock); 488 *nPSock = -1; 489 cout << headerBuf << endl; 490 cout << "!error: Location" << endl; 491 return -1; 492 } 493 else 494 { 495 //pthread_mutex_lock(&mutexMemory); 496 char *loc=strdup(iPage.m_sLocation.c_str()); 497 //pthread_mutex_unlock(&mutexMemory); 498 *location = loc; 499 close(sock); 500 *nPSock = -1; 501 return -300;//重定向了 502 } 503 } 504 505 if(iPage.m_nStatusCode<200 || iPage.m_nStatusCode>299 ){ 506 close(sock); 507 *nPSock = -1; 508 cout << "!header code = " << iPage.m_nStatusCode << endl; 509 return -1; 510 } 511 512 // when crawling images for ImgSE, remember to comment the paragraph 513 // when crawling plain text for SE, remember to open the paragraph 514 // paragraph begin 515 if( iPage.m_sContentType.find("image") != string::npos ) 516 { // 517 close(sock); 518 *nPSock = -1; 519 return -4; 520 } 521 // paragraph end 522 523 if (iPage.m_nContentLength == -1) 524 { 525 close(sock); 526 *nPSock = -1; 527 cout << headerBuf << endl; 528 cout << "!error: Content-length" << endl; 529 return -1; 530 } 531 532 if (iPage.m_nContentLength==0 || iPage.m_nContentLength<20) 533 { // Allocate enough memory to hold the page 534 iPage.m_nContentLength = DEFAULT_PAGE_BUF_SIZE; 535 } 536 537 538 if (iPage.m_nContentLength > MAX_PAGE_BUF_SIZE) 539 { 540 cout<<"这个网页的长度大于5M,我过滤掉它!"<<endl; 541 cout << "the page discarded due to its size " 542 << iPage.m_nContentLength 543 << " is larger than " << MAX_PAGE_BUF_SIZE << endl; 544 close(sock); 545 *nPSock = -1; 546 return -1; 547 } 548 549 
//pthread_mutex_lock(&mutexMemory); 550 pageBuf = (char *)malloc(iPage.m_nContentLength); 551 //pthread_mutex_unlock(&mutexMemory); 552 if(pageBuf == NULL){ 553 close(sock); 554 *nPSock = -1; 555 cout << "malloc for pageBuf" << endl; 556 return -1; 557 } 558 559 /* Begin reading the body of the file */ 560 //开始读取网页体信息 561 fd_set rfds; 562 struct timeval tv; 563 int flags; 564 //将sock套接子文件描述符设置为非阻塞的方式 565 flags=fcntl(sock,F_GETFL,0); 566 if(flags<0) 567 { 568 close(sock); 569 *nPSock = -1; 570 if (pageBuf) 571 { 572 //pthread_mutex_lock(&mutexMemory); 573 free(pageBuf); 574 pageBuf=NULL; 575 //pthread_mutex_unlock(&mutexMemory); 576 } 577 cout << "1.fcntl() error " << endl; 578 return -1; 579 } 580 581 582 flags|=O_NONBLOCK; 583 if(fcntl(sock,F_SETFL,flags)<0){ 584 close(sock); 585 *nPSock = -1; 586 if (pageBuf) 587 { 588 free(pageBuf); pageBuf=NULL; 589 } 590 cout << "2.fcntl() error " << endl; 591 return -1; 592 } 593 594 595 //挂一个while()循环读取网页体信息 596 int pre_ret=0; 597 while(ret > 0) 598 { 599 FD_ZERO(&rfds);//清理rfds读文件描述符集合 600 FD_SET(sock, &rfds);//将sock加到rfds读文件描述符集合中 601 if( bytesRead == iPage.m_nContentLength ) 602 { 603 tv.tv_sec = 1; 604 } 605 else 606 { 607 tv.tv_sec = timeout; 608 } 609 tv.tv_usec = 0; 610 611 if(DEFAULT_TIMEOUT >= 0) 612 selectRet = select(sock+1, &rfds, NULL, NULL, &tv);//IO复用 613 else /* No timeout, can block indefinately */ 614 selectRet = select(sock+1, &rfds, NULL, NULL, NULL); 615 616 if(selectRet == 0 && timeout < 0)//超时 617 { 618 close(sock); 619 *nPSock = -1; 620 if (pageBuf) 621 { 622 //pthread_mutex_lock(&mutexMemory); 623 free(pageBuf); 624 pageBuf=NULL; 625 //pthread_mutex_unlock(&mutexMemory); 626 } 627 cout << "selectRet == 0 && timeout < 0" << endl; 628 return -1; 629 } 630 else if(selectRet == -1)//select()函数出错 631 { 632 close(sock); 633 *nPSock = -1; 634 if (pageBuf) 635 { 636 free(pageBuf); 637 pageBuf=NULL; 638 } 639 cout << "selectRet == -1" << endl; 640 return -1; 641 } 642 643 
//每次最多接收iPage.m_nContentLength字节--缓冲区的大小为iPage.m_nContentLength 644 ret = read(sock, pageBuf + bytesRead, iPage.m_nContentLength); 645 //ret = read(sock, (char*)pageBuf.c_str() + bytesRead, iPage.m_nContentLength); 646 647 if(ret == 0) break; 648 if(ret == -1 && pre_ret==0)//read()函数出错 649 { 650 close(sock); 651 *nPSock = -1; 652 if (pageBuf) 653 { 654 //pthread_mutex_lock(&mutexMemory); 655 free(pageBuf); pageBuf=NULL; 656 //pthread_mutex_unlock(&mutexMemory); 657 } 658 cout << "read()'s retval=-1" << endl; 659 return -1; 660 } 661 else if( ret == -1 && pre_ret ) 662 { 663 //cout << "2. pre_ret = " << pre_ret << endl; 664 /* 665 if( bytesRead < iPage.m_nContentLength){ // meaning we lost the connection too soon 666 cout << "lost the connection too soon" << endl; 667 freeOpageBuf); 668 return -1; 669 } 670 */ 671 break; 672 } 673 674 pre_ret = ret; 675 //cout << "1.pre_ret = " << pre_ret << endl; 676 677 bytesRead += ret; 678 679 680 /* To be tolerant of inaccurate Content-Length fields, we'll 681 * allocate another read-sized chunk to make sure we have 682 * enough room. 683 */ 684 if(ret > 0) { 685 //pthread_mutex_lock(&mutexMemory); 686 pageBuf = (char *)realloc(pageBuf, bytesRead + iPage.m_nContentLength); 687 //pthread_mutex_unlock(&mutexMemory); 688 if(pageBuf == NULL) { 689 close(sock); 690 *nPSock = -1; 691 if (pageBuf) 692 { 693 //pthread_mutex_lock(&mutexMemory); 694 free(pageBuf); pageBuf=NULL; 695 //pthread_mutex_unlock(&mutexMemory); 696 } 697 cout << "realloc()" << endl; 698 return -1; 699 } 700 } 701 702 } 703 704 /* 705 * The download buffer is too large. Trim off the safety padding. 
706 */ 707 708 //pthread_mutex_lock(&mutexMemory); 709 pageBuf = (char *)realloc(pageBuf, bytesRead+1); 710 //pthread_mutex_unlock(&mutexMemory); 711 if(pageBuf == NULL){ 712 close(sock); 713 *nPSock = -1; 714 if (pageBuf) 715 { 716 //pthread_mutex_lock(&mutexMemory); 717 free(pageBuf); pageBuf=NULL; 718 //pthread_mutex_unlock(&mutexMemory); 719 } 720 cout << "2.realloc()" << endl; 721 return -1; 722 } 723 724 725 pageBuf[bytesRead] = '\0'; 726 727 728 if(fileBuf == NULL){ /* They just wanted us to "hit" the url */ 729 if (pageBuf) 730 { 731 //pthread_mutex_lock(&mutexMemory); 732 free(pageBuf); pageBuf=NULL; 733 //pthread_mutex_unlock(&mutexMemory); 734 } 735 }else{ 736 737 738 739 char *tmp; 740 //tmp = (char *)malloc(HEADER_BUF_SIZE); 741 //pthread_mutex_lock(&mutexMemory); 742 tmp = (char *)malloc(strlen(headerBuf)+1); 743 //pthread_mutex_unlock(&mutexMemory); 744 if(tmp == NULL){ 745 close(sock); 746 *nPSock = -1; 747 if (pageBuf) 748 { 749 //pthread_mutex_lock(&mutexMemory); 750 free(pageBuf); pageBuf=NULL; 751 //pthread_mutex_unlock(&mutexMemory); 752 } 753 cout << "malloc() for headerBuf" << endl; 754 return -1; 755 } 756 //memcpy( tmp, headerBuf, HEADER_BUF_SIZE-1 ); 757 strncpy( tmp, headerBuf, strlen(headerBuf)+1 ); 758 *fileHeadBuf = tmp; 759 760 *fileBuf = pageBuf; 761 } 762 763 //close(sock); 764 *nPSock = sock; 765 return bytesRead; 766 } 767 768 769 770 771 772 /* 773 774 function: 创建套接字文件描述符,并且调用nonb_connect()同目标服务器进行连接 775 776 success: return sock[成功创建的套接子文件描述符] 777 778 fail: return -1 其他错误 779 780 return -2 在IP阻塞范围内 781 782 */ 783 int CHttp::CreateSocket(const char *host, int port) 784 { 785 int sock; // Socket descriptor 786 struct sockaddr_in sa; // Socket address 787 788 789 unsigned long inaddr; 790 int ret; 791 792 CUrl url; 793 char *ip = url.GetIpByHost(host);//通过主机号得到IP地址 794 795 if( ip == NULL )//获得失败 796 { // gethostbyname() error in GetIpByHost() 797 //cout << "invalid host: " << host << endl; 798 return -1; 799 800 } 801 else 802 { 
803 // filter ip (decide whether it is inside the ip block) 804 if( url.IsValidIp(ip) )//在IP阻塞范围内 805 { 806 // inside 807 inaddr = (unsigned long)inet_addr(ip);//将字符串IP转化为32位的网络字节序 808 809 if( inaddr == INADDR_NONE ) 810 { 811 // release the buffer, be careful 812 //pthread_mutex_lock(&mutexMemory); 813 delete [] ip; ip = NULL; 814 //pthread_mutex_unlock(&mutexMemory); 815 cout << "invalid ip " << ip << endl; 816 return -1; 817 } 818 819 memcpy((char *)&sa.sin_addr, (char *)&inaddr, sizeof(inaddr)); 820 821 // release the buffer, be carful 822 //pthread_mutex_lock(&mutexMemory); 823 delete [] ip; ip = NULL; 824 //pthread_mutex_unlock(&mutexMemory); 825 826 } 827 else//在IP阻塞范围外 828 { // out of ip block 829 // release the buffer, be carful 830 //pthread_mutex_lock(&mutexMemory); 831 delete [] ip; ip = NULL; 832 //pthread_mutex_unlock(&mutexMemory); 833 //cout << "out of ip block: " << host << endl; 834 return -2; 835 } 836 } 837 838 839 /* Copy host address from hostent to (server) socket address */ 840 sa.sin_family = AF_INET; 841 sa.sin_port = htons(port); /* Put portnum into sockaddr */ 842 843 sock = -1; 844 sock = socket(AF_INET, SOCK_STREAM, 0);//创建套接字文件描述符 845 if(sock < 0 ) //创建失败 846 { 847 cout << "socket() in CreateSocket" << endl; 848 return -1; 849 } 850 851 int optval = 1; 852 if (setsockopt (sock, SOL_SOCKET, SO_REUSEADDR,(char *)&optval, sizeof (optval)) < 0) 853 //SOL_SOCKET 通用套接字选项 854 //SO_REUSEADDR 表示允许本地地址重用 855 { 856 857 cout << "setsockopt() in CreateSocket" << endl; 858 close(sock); 859 return -1; 860 } 861 862 //ret = connect(sock, (struct sockaddr *)&sa, sizeof(sa)); 863 ret = nonb_connect(sock, (struct sockaddr *)&sa, DEFAULT_TIMEOUT); 864 if(ret == -1) { 865 cout << "nonb_connect() in CreateSocket" << endl; 866 close(sock); 867 return -1; 868 } 869 870 return sock;//返回套接字文件描述符 871 } 872 873 874 /* function:通过IO复用的方法读取网页头信息 875 success: return bytesRead[网页头信息的真实长度] 876 fail: return -1 877 */ 878 int CHttp::read_header(int sock, char 
*headerPtr) 879 { 880 fd_set rfds;//读文件描述符集合 881 struct timeval tv; 882 int bytesRead = 0, newlines = 0, ret, selectRet; 883 884 int flags; 885 886 flags=fcntl(sock,F_GETFL,0);//将sock套接子文件描述符设置为非阻塞方式 887 if(flags<0) 888 { 889 cout << "1.fcntl() in read_header()< 0" << endl; 890 return -1; 891 } 892 893 flags|=O_NONBLOCK; 894 if(fcntl(sock,F_SETFL,flags)<0) 895 { 896 cout << "2.fcntl() < 0 in read_header()" << endl; 897 return -1; 898 } 899 900 //挂一个while()循环来读取网页头信息 901 while(newlines != 2 && bytesRead != HEADER_BUF_SIZE-1) 902 { 903 FD_ZERO(&rfds);//清理读文件描述符集合 904 FD_SET(sock, &rfds);//将套接字文件描述符加到读文件描述符集合中 905 tv.tv_sec = timeout;//设置最长的等待时间 906 tv.tv_usec = 0; 907 908 if(timeout >= 0) 909 selectRet = select(sock+1, &rfds, NULL, NULL, &tv); 910 else //最一个参数设置为NULL,表示阻塞操作会一直等待,直到莫个监视的文件集合中的某个文件描述符符合返回条件 911 selectRet = select(sock+1, &rfds, NULL, NULL, NULL); 912 913 if(selectRet == 0 && timeout < 0) 914 { 915 cout << "selectRet == 0 && timeout < 0" << endl; 916 return -1; 917 } 918 else if(selectRet == -1) //select()出错 919 { 920 cout << "selectRet == 0 && timeout < 0 else" << endl; 921 return -1; 922 } 923 924 ret = read(sock, headerPtr, 1); 925 if(ret == -1) 926 { 927 cout << "!error: read() in read_header()" << endl; 928 return -1; 929 } 930 931 bytesRead++; 932 933 if(*headerPtr == '\r') 934 { /* Ignore CR */ 935 /* Basically do nothing special, just don't set newlines 936 * to 0 */ 937 headerPtr++; 938 continue; 939 } 940 else if(*headerPtr == '\n') /* LF is the separator */ 941 newlines++; 942 else 943 newlines = 0; 944 945 headerPtr++; 946 947 } 948 949 //headerPtr -= 3; /* Snip the trailing LF's */ 950 /* to be compatible with Tianwang format, we have to retain them*/ 951 headerPtr -= 2; 952 *headerPtr = '\0'; 953 //cout << "in it " << headerPtr << endl; 954 return bytesRead; 955 } 956 957 958 959 /* 960 function:被CreateSocket()调用,通过IO复用的方法连接目标服务器 961 success: return 0; 962 fail: return -1; 963 sockfd: 套接子文件描述符 964 sa: 服务器套接子地址结构 965 sec: 最长的等待时间 966 */ 967 
int CHttp::nonb_connect(int sockfd,struct sockaddr* sa,int sec) 968 { 969 int flags; 970 int status; 971 fd_set mask;//写文件描述符集合 972 struct timeval timeout; 973 974 //set the socket as nonblocking 975 flags=fcntl(sockfd,F_GETFL,0);//将套接子文件描述符设置为非阻塞方式 976 977 if(flags<0) return -1; 978 flags|=O_NONBLOCK;//设置非阻塞方式 979 if(fcntl(sockfd,F_SETFL,flags) < 0) 980 { 981 cout << "1.fcntl() in nonb_connect" << endl; 982 return -1; 983 } 984 985 if( connect(sockfd,sa,sizeof(struct sockaddr)) == 0)//立刻连接上了 986 { 987 flags&=~O_NONBLOCK;//因为上面已经设置了非阻塞方式,所以我们这里有必要重新设置阻塞方式--相当于复位 988 fcntl(sockfd,F_SETFL,flags); 989 return sockfd;//connected immediately 990 } 991 992 FD_ZERO(&mask);//清理写文件描述符集合mask 993 FD_SET(sockfd,&mask);//将sockfd套接字文件描述符加到文件描述符集合mask中 994 timeout.tv_sec=sec;//设置最长的等待时间 995 timeout.tv_usec=0; 996 status=select(sockfd+1,NULL,&mask,NULL,&timeout);//IO复用 997 998 switch(status){ 999 case -1: // Select error, set the socket as default blocking //select()出错 1000 flags&=~O_NONBLOCK; 1001 fcntl(sockfd,F_SETFL,flags); 1002 cout << "2.fcntl() in nonb_connect" << endl; 1003 return -1; 1004 case 0: //Connection timed out.//连接超时 1005 flags&=~O_NONBLOCK; 1006 fcntl(sockfd,F_SETFL,flags); 1007 cout << "3.fcntl() in nonb_connect" << endl; 1008 return -1; 1009 default: // Connected successfully.//连接成功 1010 FD_CLR(sockfd,&mask); 1011 flags&=~O_NONBLOCK; 1012 fcntl(sockfd,F_SETFL,flags); 1013 return 0; 1014 } 1015 } 1016 1017 /* 1018 function: 检测*buf所指的内存空间剩余值是否大于more,不过再加more+1单位的内存空间 1019 success: return 0; 1020 fail: return -1; 1021 */ 1022 int CHttp::checkBufSize(char **buf, int *bufsize, int more) 1023 { 1024 char *tmp; 1025 int roomLeft = *bufsize - (strlen(*buf) + 1);//*buf内存空间的剩余值 1026 1027 if(roomLeft > more) return 0;//剩余值大于more返回0 1028 1029 //pthread_mutex_lock(&mutexMemory); 1030 tmp = (char *)realloc(*buf, *bufsize + more + 1);//剩余值不够more,这个时候我们要调整内存空间的长度,长度加more+1 1031 //pthread_mutex_unlock(&mutexMemory); 1032 if(tmp == NULL) return -1;//没有调整成功返回-1 1033 1034 *buf = 
tmp; 1035 *bufsize += more + 1; 1036 return 0;//调整成功 1037 }