CHttp

 1 #ifndef _HTTP_H_031105_
 2 #define _HTTP_H_031105_
 3 
 4 #include <map>
 5 
 6 using namespace std;
 7 
 /**
  * Minimal HTTP client used to download a single page over a plain TCP
  * socket.  The only public operation is Fetch(); the private members
  * below are the helpers it is built from.
  */
 class CHttp
 {
 private:
     // NOTE(review): neither member is referenced by any method visible in
     // this file; m_sock is a raw pointer with no in-class initializer.
     std::string m_strUrl;    // url
     int *m_sock;             // socket

 public:
     CHttp();
     virtual ~CHttp();

     /**
      * Download the page identified by strUrl.
      *
      * @param strUrl    URL of the page to fetch.
      * @param fileBuf   out: receives the page body.
      * @param fileHead  out: receives the response header.
      * @param location  out: receives the redirect target when the server
      *                  answers with a redirect.
      * @param sock      in/out: socket file descriptor.
      * @return see the documentation on the definition of Fetch().
      */
     int Fetch(std::string strUrl, char **fileBuf,
         char **fileHead, char **location, int* sock);

 private:
     // The four private helpers below are called (directly or indirectly)
     // by Fetch().

     // Reads the response header from sock using select()-based I/O
     // multiplexing.
     int read_header(int sock, char *headerPtr);

     // Creates a socket file descriptor connected to host:port.
     int CreateSocket(const char *host, int port);

     // Called by CreateSocket(): connects to the target server using
     // select()-based I/O multiplexing.
     int nonb_connect(int, struct sockaddr*, int);

     // Checks whether the space left in *buf is greater than `more`; if it
     // is not, the buffer is grown by more+1 bytes.
     int checkBufSize(char **buf, int *bufsize, int more);

 };
42 
43 extern pthread_mutex_t mutexMemory;
44 
45 #endif /* _HTTP_H_031105_ */
   1 #include <stdlib.h>
   2 #include <stdio.h>
   3 #include <string.h>
   4 #include <strings.h>
   5 #include <errno.h>
   6 #include <netdb.h>
   7 #include <unistd.h>
   8 #include <netinet/in.h>
   9 #include <sys/types.h>
  10 #include <sys/socket.h>
  11 #include <sys/time.h>
  12 #include <fcntl.h>
  13 #include <iostream>
  14 #include "Http.h"
  15 
  16 //#include "Tse.h"
  17 #include "CommonDef.h"
  18 #include "Url.h"
  19 //#include "Page.h"
  20 #include "StrFun.h"
  21 
  22 char *userAgent = NULL;
  23 int timeout = DEFAULT_TIMEOUT;//设置最长的等待时间30秒
  24 int hideUserAgent = 0;
  25 
  26 CHttp::CHttp()
  27 {
  28 }
  29 
  30 CHttp::~CHttp()
  31 {
  32 }
  33 
  34 
  35     /*
  36          * Actually downloads the page, registering a hit (donation)
  37          *      If the fileBuf passed in is NULL, the url is downloaded and then
  38          *      freed; otherwise the necessary space is allocated for fileBuf.
  39          *      Returns size of download on success, 
  40             -1 on error is set,
  41              -2 out of ip block,
  42              -3 invalid host,
  43             -4 MIME is imag/xxx
  44              -300 on 301.
  45          */
  46 
  47 
  48 
  49 /*
  50 
  51 function:
  52 
  53 success: return bytesRead[网页体信息的真实的字节数]
  54 
  55 fail:    return -1  各种其他的错误
  56 
  57          return -2  在IP阻塞范围内
  58 
  59          return -3  无效的主机号
  60 
  61          return -4  image/text类型
  62 
  63          return -300 网页重定向
  64 
  65 strUrl:  待抓取的网页对应的URL
  66 
  67 fileBuf: 网页体信息
  68 
  69 fileHead:网页头信息
  70 
  71 location:网页如果重定向对应的URL
  72 
  73 sock:套接子文件描述符
  74 
  75 */
  76 int CHttp::Fetch(string strUrl, char **fileBuf, char **fileHeadBuf, char **location, int* nPSock )
  77 {
  78     char *tmp, *url, *requestBuf, *pageBuf;
  79     const char *host, *path;
  80     int sock, bytesRead = 0, bufsize = REQUEST_BUF_SIZE;
  81     int ret = -1, tempSize, selectRet;
  82     int port = 80;
  83 
  84 
  85     if( strUrl.empty() )//空的URL肯定不能抓取到网页
  86     {
  87         cout << "strUrl is NULL" << endl;
  88         return -1;
  89     }
  90 
  91     /* Copy the url passed in into a buffer we can work with, change, etc. */
  92 /*
  93     url = (char*)malloc(strUrl.length()+1);
  94     if( url == NULL ){
  95         cout << "can not allocate enought memory for url" << endl;
  96         return -1;
  97     } else {
  98         memset(url, 0,strUrl.length()+1);
  99         memcpy(url, strUrl.c_str(), strUrl.length() );
 100     }
 101 */
 102     //pthread_mutex_lock(&mutexMemory);
 103     url = strdup(strUrl.c_str());//复制url
 104     //pthread_mutex_unlock(&mutexMemory);
 105     if( url == NULL )//分配失败
 106     {
 107         cout << "!error: stdup() in Fetch()" << endl;
 108         return -1;
 109     }
 110 
 111     // parse the url
 112     CUrl u;
 113     if( u.ParseUrlEx(url) == false )
 114     {
 115         //如果没有"http://"协议号,肯定会解析错误
 116         cout << "ParseUrlEx error in Fetch(): " << strUrl << endl;
 117         return -1;
 118     }
 119 
 120     host = u.m_sHost.c_str();
 121     path = u.m_sPath.c_str();
 122     if( u.m_nPort > 0 ) port = u.m_nPort;
 123 
 124     /* Compose a request string */
 125     //pthread_mutex_lock(&mutexMemory);
 126 
 127     /*构造HTTP请求报文:  假设strUrl="http://www.baidu.com/ecjtu/nihao.html"*/
 128     // GET /ecjtu/nihao.html HTTP/1.0\r\n
 129     requestBuf = (char*)malloc(bufsize);
 130     //pthread_mutex_unlock(&mutexMemory);
 131     if(requestBuf == NULL)
 132     {
 133         if (url)
 134         {
 135             //pthread_mutex_lock(&mutexMemory);
 136             free(url);
 137             url=NULL;
 138             //pthread_mutex_unlock(&mutexMemory);
 139         }
 140         cout << "can not allocate enought memory for requestBuf" << endl;
 141         return -1;
 142     }
 143     requestBuf[0] = 0;
 144 
 145     if( strlen(path) < 1 )//说明请求的是根目录下的网页
 146     {
 147         // GET / HTTP/1.0\r\n
 148         /* The url has no '/' in it, assume the user is making a root-level
 149                  *      request */
 150         tempSize = strlen("GET /") + strlen(HTTP_VERSION) +2;
 151 /*
 152         if( tempSize > bufsize ){
 153             free(url);
 154             free(requestBuf);
 155             cout << "tempSize larger than bufsize" << endl;
 156             return -1;
 157         }
 158 */
 159 
 160         if(checkBufSize(&requestBuf, &bufsize, tempSize) ||    snprintf(requestBuf, bufsize, "GET / %s\r\n", HTTP_VERSION) < 0 ){
 161             /*int snprintf(char *restrict buf, size_t n, const char * restrict  format, ...);
 162              函数说明:最多从源串中拷贝n-1个字符到目标串中,然后再在后面加一个0。所以如果目标串的大小为n
 163              的话,将不会溢出。*/
 164 
 165             //pthread_mutex_lock(&mutexMemory);
 166             if (url)
 167             {
 168                  free(url); 
 169                  url=NULL;
 170             }
 171             if (requestBuf)
 172             {
 173                  free(requestBuf); 
 174                  requestBuf=NULL;
 175             }
 176             //pthread_mutex_unlock(&mutexMemory);
 177             cout << "1.checkBuffSize(&requestBuf..) error" << endl;
 178             return -1;
 179         }
 180 
 181     }
 182     else//说明请求的是非根目录下的网页
 183     {
 184         tempSize = strlen("GET ") + strlen(path) + strlen(HTTP_VERSION) + 4;
 185 
 186         if(checkBufSize(&requestBuf, &bufsize, tempSize) ||    snprintf(requestBuf, bufsize, "GET %s %s\r\n", path, HTTP_VERSION) < 0)
 187         {
 188 
 189             //pthread_mutex_lock(&mutexMemory);
 190             if (url)
 191             {
 192                  free(url); 
 193                  url=NULL;
 194             }
 195             if (requestBuf)
 196             {
 197                  free(requestBuf); 
 198                  requestBuf=NULL;
 199             }
 200             //pthread_mutex_unlock(&mutexMemory);
 201             cout << "2._checkBuffSize(&requestBuf..) error" << endl;
 202             return -1;
 203         }
 204 
 205     }
 206 
 207 
 208     /* Use Host: even though 1.0 doesn't specify it.  Some servers
 209          *      won't play nice if we don't send Host, and it shouldn't hurt anything */
 210     tempSize = (int)strlen("Host: ") + (int)strlen(host) + 3;/* +3 for "\r\n\0" */
 211 
 212     if(checkBufSize(&requestBuf, &bufsize, tempSize + 128)){
 213         //pthread_mutex_lock(&mutexMemory);
 214         if (url)
 215         {
 216              free(url); url=NULL;
 217         }
 218         if (requestBuf)
 219         {
 220              free(requestBuf); requestBuf=NULL;
 221         }
 222         //pthread_mutex_unlock(&mutexMemory);
 223         cout << "3._checkBuffSize(&requestBuf..) error" << endl;
 224         return -1;
 225     }
 226 
 227     strcat(requestBuf, "Host: ");
 228     strcat(requestBuf, host);
 229     strcat(requestBuf, "\r\n");
 230 
 231     if(!hideUserAgent && userAgent == NULL) {
 232 
 233         tempSize = (int)strlen("User-Agent: ") +
 234             (int)strlen(DEFAULT_USER_AGENT) + (int)strlen(VERSION) + 4;
 235         if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
 236             //pthread_mutex_lock(&mutexMemory);
 237             if (url)
 238             {
 239                  free(url); url=NULL;
 240             }
 241             if (requestBuf)
 242             {
 243                  free(requestBuf); requestBuf=NULL;
 244             }
 245             //pthread_mutex_unlock(&mutexMemory);
 246             cout << "4._checkBuffSize(&requestBuf..) error" << endl;
 247             return -1;
 248         }
 249         strcat(requestBuf, "User-Agent: ");
 250         strcat(requestBuf, DEFAULT_USER_AGENT);
 251         strcat(requestBuf, "/");
 252         strcat(requestBuf, VERSION);
 253         strcat(requestBuf, "\r\n");
 254 
 255     } else if(!hideUserAgent) {
 256 
 257         tempSize = (int)strlen("User-Agent: ") + (int)strlen(userAgent) + 3;
 258         if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
 259 
 260             //pthread_mutex_lock(&mutexMemory);
 261             if (url)
 262             {
 263                  free(url); url=NULL;
 264             }
 265             if (requestBuf)
 266             {
 267                  free(requestBuf); requestBuf=NULL;
 268             }
 269             //pthread_mutex_unlock(&mutexMemory);
 270             cout << "5._checkBuffSize(&requestBuf..) error" << endl;
 271             return -1;
 272         }
 273         strcat(requestBuf, "User-Agent: ");
 274         strcat(requestBuf, userAgent);
 275         strcat(requestBuf, "\r\n");
 276     }
 277 
 278     //tempSize = (int)strlen("Connection: Close\n\n");
 279     tempSize = (int)strlen("Connection: Keep-Alive\r\n\r\n");
 280     if(checkBufSize(&requestBuf, &bufsize, tempSize)) {
 281         //pthread_mutex_lock(&mutexMemory);
 282         if (url)
 283         {
 284              free(url); url=NULL;
 285         }
 286         if (requestBuf)
 287         {
 288              free(requestBuf); requestBuf=NULL;
 289         }
 290         //pthread_mutex_unlock(&mutexMemory);
 291         cout << "6._checkBuffSize(&requestBuf..) error" << endl;
 292         return -1;
 293     }
 294 
 295 
 296     //strcat(requestBuf, "Connection: Close\n\n");
 297     strcat(requestBuf, "Connection: Keep-Alive\r\n\r\n");
 298 
 299 
 300     /* Now free any excess memory allocated to the buffer */
 301     //pthread_mutex_lock(&mutexMemory);
 302     //重新调整requestBuf的内存空间,释放多余的内存空间
 303     tmp = (char *)realloc(requestBuf, strlen(requestBuf) + 1);
 304     //pthread_mutex_unlock(&mutexMemory);
 305     if(tmp == NULL){
 306         //pthread_mutex_lock(&mutexMemory);
 307         if (url)
 308         {
 309              free(url); url=NULL;
 310         }
 311         if (requestBuf)
 312         {
 313              free(requestBuf); requestBuf=NULL;
 314         }
 315         //pthread_mutex_unlock(&mutexMemory);
 316         cout << "realloc for tmp error" << endl;
 317         return -1;
 318     }
 319     requestBuf = tmp;
 320 
 321     if( *nPSock != -1 ){
 322         sock = *nPSock;
 323         cout << "using privous socket " << *nPSock << endl;
 324     }else{
 325 
 326         // cout << "1.get a new one" << endl;
 327         sock = CreateSocket( host, port );
 328         if(sock == -1) { // invalid host
 329             //pthread_mutex_lock(&mutexMemory);
 330             if (url)
 331             {
 332                  free(url); url=NULL;
 333             }
 334             if (requestBuf)
 335             {
 336                  free(requestBuf); requestBuf=NULL;
 337             }
 338             //pthread_mutex_unlock(&mutexMemory);
 339             return -3;
 340         }
 341         if(sock == -2) { // out of ip block
 342             //pthread_mutex_lock(&mutexMemory);
 343             if (url)
 344             {
 345                  free(url); url=NULL;
 346             }
 347             if (requestBuf)
 348             {
 349                  free(requestBuf); requestBuf=NULL;
 350             }
 351             //pthread_mutex_unlock(&mutexMemory);
 352             //cout << "2.not able to MakeSocket" << endl;
 353             return -2;
 354         }
 355     }
 356     
 357 
 358 
 359     ret = write(sock, requestBuf, strlen(requestBuf));
 360     if( ret == 0 ){
 361         cout << "requestBuf is " << requestBuf << endl;
 362         cout << "write nothing" << endl;
 363         //pthread_mutex_lock(&mutexMemory);
 364         if (url)
 365         {
 366             free(url); url=NULL;
 367         }
 368         if (requestBuf)
 369         {
 370             free(requestBuf); requestBuf=NULL;
 371         }
 372         //pthread_mutex_unlock(&mutexMemory);
 373         close(sock);
 374         *nPSock = -1;
 375         return -1;
 376         
 377     }
 378     if( ret == -1){
 379         //cout << "write error" << endl;
 380         // sock is invalid,we should make a new one
 381         close(sock);
 382         *nPSock  = -1;
 383 
 384         cout << "2.close previous socket " << *nPSock << " and get a new one" << endl;
 385         //maybe sock is dead,try again
 386         sock = CreateSocket( host, port );
 387         if(sock == -1) { 
 388             //pthread_mutex_lock(&mutexMemory);
 389             if (url)
 390             {
 391                 free(url); url=NULL;
 392             }
 393             if (requestBuf)
 394             {
 395                 free(requestBuf); requestBuf=NULL;
 396             }
 397             //pthread_mutex_unlock(&mutexMemory);
 398             cout << "3.not able to MakeSocket" << endl;
 399             return -1;
 400         }
 401         if(sock == -2) { 
 402             //pthread_mutex_lock(&mutexMemory);
 403             if (url)
 404             {
 405                 free(url); url=NULL;
 406             }
 407             if (requestBuf)
 408             {
 409                 free(requestBuf); requestBuf=NULL;
 410             }
 411             //pthread_mutex_unlock(&mutexMemory);
 412             cout << "4.not able to MakeSocket" << endl;
 413             return -1;
 414         }
 415         if(write(sock, requestBuf, strlen(requestBuf)) == -1){
 416             //pthread_mutex_lock(&mutexMemory);
 417             if (url)
 418             {
 419                 free(url); url=NULL;
 420             }
 421             if (requestBuf)
 422             {
 423                 free(requestBuf); requestBuf=NULL;
 424             }
 425             //pthread_mutex_unlock(&mutexMemory);
 426             close(sock);
 427             *nPSock = -1;
 428             cout << "write error" << endl;
 429             return -1;
 430         }
 431     }
 432 
 433     //pthread_mutex_lock(&mutexMemory);
 434     if (url)
 435     {
 436         free(url); url=NULL;
 437     }
 438     if (requestBuf)
 439     {
 440         free(requestBuf); requestBuf=NULL;
 441     }
 442     //pthread_mutex_unlock(&mutexMemory);
 443 
 444 
 445     char headerBuf[HEADER_BUF_SIZE];
 446     /* Grab enough of the response to get the metadata */
 447     memset( headerBuf,0,HEADER_BUF_SIZE );
 448     //cout << "old sock is " << sock << endl;
 449     ret = read_header(sock, headerBuf);
 450     //cout << "ret = " << ret << endl;
 451     if(ret < 0) { 
 452         close(sock); 
 453         *nPSock = -1;
 454         return -1;
 455     }
 456 
 457     //cout << headerBuf << endl;
 458     if( strlen(headerBuf) == 0 ){
 459         cout << "strlen(headerBuf) = 0" << headerBuf << endl;
 460         cout << "strUrl: " << strUrl << endl << endl;;
 461         close(sock);
 462                 *nPSock = -1;
 463         return -1;
 464     }
 465 
 466 
 467 
 468      //解析网页头信息
 469     CPage iPage;
 470     iPage.ParseHeaderInfo(headerBuf);
 471     if (iPage.m_nStatusCode == -1)
 472     {
 473         close(sock);
 474         *nPSock = -1;
 475         cout << "headerBuf: " << headerBuf << endl;
 476         cout << "!header error: not find HTTP" << endl;
 477         return -1;
 478     }
 479 
 480 
 481 
 482     // deal with http://net.cs.pku.edu.cn/~cnds
 483     if (iPage.m_nStatusCode == 301 || iPage.m_nStatusCode == 302)
 484     {
 485         if (iPage.m_sLocation.empty() || iPage.m_sLocation.size()>URL_LEN)
 486         {    
 487             close(sock);
 488             *nPSock = -1;
 489             cout << headerBuf << endl;
 490             cout << "!error: Location" << endl;
 491             return -1;
 492         }
 493         else
 494         {
 495             //pthread_mutex_lock(&mutexMemory);
 496             char *loc=strdup(iPage.m_sLocation.c_str());
 497             //pthread_mutex_unlock(&mutexMemory);
 498             *location = loc;
 499             close(sock);
 500             *nPSock = -1;
 501             return -300;//重定向了
 502         }
 503     }
 504 
 505     if(iPage.m_nStatusCode<200 || iPage.m_nStatusCode>299 ){
 506         close(sock);
 507         *nPSock = -1;
 508         cout << "!header code = " << iPage.m_nStatusCode << endl;
 509         return -1;
 510     }
 511 
 512     // when crawling images for ImgSE, remember to comment the paragraph
 513     // when crawling plain text for SE, remember to open the paragraph
 514     // paragraph begin
 515     if( iPage.m_sContentType.find("image") != string::npos )
 516     { // 
 517         close(sock);
 518         *nPSock = -1;
 519         return -4;
 520     }
 521     // paragraph end
 522 
 523     if (iPage.m_nContentLength == -1)
 524     {
 525         close(sock);
 526         *nPSock = -1;
 527         cout << headerBuf << endl;
 528         cout << "!error: Content-length" << endl;
 529         return -1;
 530     }
 531 
 532     if (iPage.m_nContentLength==0 || iPage.m_nContentLength<20)
 533     { // Allocate enough memory to hold the page 
 534         iPage.m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
 535     }
 536 
 537 
 538     if (iPage.m_nContentLength > MAX_PAGE_BUF_SIZE)
 539     {
 540          cout<<"这个网页的长度大于5M,我过滤掉它!"<<endl;
 541         cout << "the page discarded due to its size " 
 542             << iPage.m_nContentLength 
 543             << " is larger than " << MAX_PAGE_BUF_SIZE << endl;
 544         close(sock);
 545         *nPSock = -1;
 546         return -1;
 547     }
 548 
 549     //pthread_mutex_lock(&mutexMemory);
 550     pageBuf = (char *)malloc(iPage.m_nContentLength);
 551     //pthread_mutex_unlock(&mutexMemory);
 552     if(pageBuf == NULL){
 553         close(sock);
 554         *nPSock = -1;
 555         cout << "malloc for pageBuf" << endl;
 556         return -1;
 557     }
 558     
 559     /* Begin reading the body of the file */
 560     //开始读取网页体信息
 561     fd_set rfds;
 562     struct timeval tv;
 563     int flags;
 564     //将sock套接子文件描述符设置为非阻塞的方式
 565     flags=fcntl(sock,F_GETFL,0);
 566     if(flags<0)
 567     {
 568         close(sock);
 569         *nPSock = -1;
 570         if (pageBuf)
 571         {
 572             //pthread_mutex_lock(&mutexMemory);
 573             free(pageBuf);
 574             pageBuf=NULL;
 575             //pthread_mutex_unlock(&mutexMemory);
 576         }
 577         cout << "1.fcntl() error " << endl;
 578         return -1;
 579     }
 580     
 581     
 582     flags|=O_NONBLOCK;
 583     if(fcntl(sock,F_SETFL,flags)<0){
 584         close(sock);
 585         *nPSock = -1;
 586         if (pageBuf)
 587         {
 588             free(pageBuf); pageBuf=NULL;
 589         }
 590         cout << "2.fcntl() error " << endl;
 591         return -1;
 592     }
 593 
 594 
 595     //挂一个while()循环读取网页体信息
 596     int pre_ret=0;
 597     while(ret > 0)
 598     {
 599         FD_ZERO(&rfds);//清理rfds读文件描述符集合
 600         FD_SET(sock, &rfds);//将sock加到rfds读文件描述符集合中
 601         if( bytesRead == iPage.m_nContentLength )
 602         {
 603             tv.tv_sec = 1;
 604         }
 605         else
 606         {
 607             tv.tv_sec = timeout;
 608         }
 609         tv.tv_usec = 0;
 610 
 611         if(DEFAULT_TIMEOUT >= 0)
 612             selectRet = select(sock+1, &rfds, NULL, NULL, &tv);//IO复用
 613         else            /* No timeout, can block indefinately */
 614             selectRet = select(sock+1, &rfds, NULL, NULL, NULL);
 615 
 616         if(selectRet == 0 && timeout < 0)//超时
 617         {
 618             close(sock);
 619             *nPSock = -1;
 620             if (pageBuf)
 621             {
 622                 //pthread_mutex_lock(&mutexMemory);
 623                 free(pageBuf);
 624                 pageBuf=NULL;
 625                 //pthread_mutex_unlock(&mutexMemory);
 626             }
 627             cout << "selectRet == 0 && timeout < 0" << endl;
 628             return -1;
 629         } 
 630         else if(selectRet == -1)//select()函数出错
 631         {
 632             close(sock);
 633             *nPSock = -1;
 634             if (pageBuf)
 635             {
 636                 free(pageBuf);
 637                 pageBuf=NULL;
 638             }
 639             cout << "selectRet == -1" << endl;
 640             return -1;
 641         }
 642 
 643         //每次最多接收iPage.m_nContentLength字节--缓冲区的大小为iPage.m_nContentLength
 644         ret = read(sock, pageBuf + bytesRead, iPage.m_nContentLength);
 645         //ret = read(sock, (char*)pageBuf.c_str() + bytesRead, iPage.m_nContentLength);
 646 
 647         if(ret == 0) break;
 648         if(ret == -1 && pre_ret==0)//read()函数出错
 649         {
 650             close(sock);
 651             *nPSock = -1;
 652             if (pageBuf)
 653             {
 654                 //pthread_mutex_lock(&mutexMemory);
 655                 free(pageBuf); pageBuf=NULL;
 656                 //pthread_mutex_unlock(&mutexMemory);
 657             }
 658             cout << "read()'s retval=-1" << endl;
 659             return -1;
 660         }
 661         else if( ret == -1 && pre_ret )
 662         {
 663             //cout << "2. pre_ret = " << pre_ret << endl;
 664 /*
 665             if( bytesRead < iPage.m_nContentLength){    // meaning we lost the connection too soon
 666                 cout << "lost the connection too soon" << endl;
 667                 freeOpageBuf);
 668                 return -1;
 669             }
 670 */
 671             break;
 672         }
 673 
 674         pre_ret = ret;
 675         //cout << "1.pre_ret = " << pre_ret << endl;
 676 
 677         bytesRead += ret;
 678 
 679 
 680             /* To be tolerant of inaccurate Content-Length fields, we'll
 681              *      allocate another read-sized chunk to make sure we have
 682              *      enough room.
 683              */
 684         if(ret > 0) {
 685             //pthread_mutex_lock(&mutexMemory);
 686             pageBuf = (char *)realloc(pageBuf, bytesRead + iPage.m_nContentLength);
 687             //pthread_mutex_unlock(&mutexMemory);
 688             if(pageBuf == NULL) {
 689                 close(sock);
 690                 *nPSock = -1;
 691                 if (pageBuf)
 692                 {
 693                     //pthread_mutex_lock(&mutexMemory);
 694                     free(pageBuf); pageBuf=NULL;
 695                     //pthread_mutex_unlock(&mutexMemory);
 696                 }
 697                 cout << "realloc()" << endl;
 698                 return -1;
 699             }
 700         }
 701 
 702     }
 703 
 704     /*
 705      * The download buffer is too large.  Trim off the safety padding.
 706     */
 707 
 708     //pthread_mutex_lock(&mutexMemory);
 709     pageBuf = (char *)realloc(pageBuf, bytesRead+1);
 710     //pthread_mutex_unlock(&mutexMemory);
 711     if(pageBuf == NULL){
 712         close(sock);
 713         *nPSock = -1;
 714         if (pageBuf)
 715         {
 716             //pthread_mutex_lock(&mutexMemory);
 717             free(pageBuf); pageBuf=NULL;
 718             //pthread_mutex_unlock(&mutexMemory);
 719         }
 720         cout << "2.realloc()" << endl;
 721         return -1;
 722     }
 723 
 724 
 725     pageBuf[bytesRead] = '\0';
 726 
 727 
 728     if(fileBuf == NULL){    /* They just wanted us to "hit" the url */
 729         if (pageBuf)
 730         {
 731             //pthread_mutex_lock(&mutexMemory);
 732             free(pageBuf); pageBuf=NULL;
 733             //pthread_mutex_unlock(&mutexMemory);
 734         }
 735     }else{
 736 
 737 
 738 
 739         char *tmp;
 740         //tmp = (char *)malloc(HEADER_BUF_SIZE);
 741         //pthread_mutex_lock(&mutexMemory);
 742         tmp = (char *)malloc(strlen(headerBuf)+1);
 743         //pthread_mutex_unlock(&mutexMemory);
 744             if(tmp == NULL){
 745                     close(sock);
 746             *nPSock = -1;
 747             if (pageBuf)
 748             {
 749                 //pthread_mutex_lock(&mutexMemory);
 750                 free(pageBuf); pageBuf=NULL;
 751                 //pthread_mutex_unlock(&mutexMemory);
 752             }
 753             cout << "malloc() for headerBuf" << endl;
 754                     return -1;
 755             }
 756         //memcpy( tmp, headerBuf, HEADER_BUF_SIZE-1 );
 757         strncpy( tmp, headerBuf, strlen(headerBuf)+1 );
 758         *fileHeadBuf = tmp;
 759 
 760         *fileBuf = pageBuf;
 761     }
 762         
 763     //close(sock);
 764     *nPSock = sock;
 765     return bytesRead;
 766 }
 767     
 768 
 769 
 770 
 771 
 772 /*
 773 
 774 function: 创建套接字文件描述符,并且调用nonb_connect()同目标服务器进行连接
 775 
 776 success:  return sock[成功创建的套接子文件描述符]
 777 
 778 fail:     return -1  其他错误
 779 
 780           return -2  在IP阻塞范围内
 781 
 782 */
 783 int CHttp::CreateSocket(const char *host, int port)
 784 {
 785     int sock;        // Socket descriptor
 786     struct sockaddr_in sa;    // Socket address
 787 
 788 
 789     unsigned long   inaddr;
 790     int ret;
 791 
 792     CUrl url;
 793     char *ip = url.GetIpByHost(host);//通过主机号得到IP地址
 794 
 795     if( ip == NULL )//获得失败
 796     { // gethostbyname() error in GetIpByHost()
 797         //cout << "invalid host: " << host << endl;
 798         return -1;
 799 
 800     } 
 801     else 
 802     {
 803         // filter ip (decide whether it is inside the ip block)
 804         if( url.IsValidIp(ip) )//在IP阻塞范围内
 805         {
 806             // inside
 807             inaddr = (unsigned long)inet_addr(ip);//将字符串IP转化为32位的网络字节序
 808 
 809             if( inaddr == INADDR_NONE )
 810             {
 811                 // release the buffer, be careful
 812                 //pthread_mutex_lock(&mutexMemory);
 813                 delete [] ip; ip = NULL;
 814                 //pthread_mutex_unlock(&mutexMemory);
 815                 cout << "invalid ip " << ip << endl;
 816                 return -1;
 817             }
 818 
 819             memcpy((char *)&sa.sin_addr, (char *)&inaddr, sizeof(inaddr));
 820 
 821             // release the buffer, be carful
 822             //pthread_mutex_lock(&mutexMemory);
 823             delete [] ip; ip = NULL;
 824             //pthread_mutex_unlock(&mutexMemory);
 825 
 826         } 
 827         else//在IP阻塞范围外
 828         { // out of ip block
 829             // release the buffer, be carful
 830             //pthread_mutex_lock(&mutexMemory);
 831             delete [] ip; ip = NULL;
 832             //pthread_mutex_unlock(&mutexMemory);
 833             //cout << "out of ip block: " << host << endl;
 834             return -2;
 835         }
 836     }
 837 
 838 
 839     /* Copy host address from hostent to (server) socket address */
 840     sa.sin_family = AF_INET;        
 841     sa.sin_port = htons(port);    /* Put portnum into sockaddr */
 842 
 843     sock = -1;
 844     sock = socket(AF_INET, SOCK_STREAM, 0);//创建套接字文件描述符
 845     if(sock < 0 ) //创建失败
 846     { 
 847         cout << "socket() in CreateSocket" << endl;
 848         return -1;
 849     }
 850 
 851     int optval = 1;
 852     if (setsockopt (sock, SOL_SOCKET, SO_REUSEADDR,(char *)&optval, sizeof (optval)) < 0)
 853         //SOL_SOCKET 通用套接字选项
 854         //SO_REUSEADDR 表示允许本地地址重用
 855     {
 856 
 857         cout << "setsockopt() in CreateSocket" << endl;
 858         close(sock);
 859         return -1;
 860     }
 861 
 862         //ret = connect(sock, (struct sockaddr *)&sa, sizeof(sa));
 863         ret = nonb_connect(sock, (struct sockaddr *)&sa, DEFAULT_TIMEOUT);
 864         if(ret == -1) { 
 865         cout << "nonb_connect() in CreateSocket" << endl;
 866         close(sock);
 867         return -1; 
 868     }
 869 
 870         return sock;//返回套接字文件描述符
 871 }
 872 
 873 
 874 /* function:通过IO复用的方法读取网页头信息
 875  success: return  bytesRead[网页头信息的真实长度]
 876  fail:    return  -1
 877  */
 878 int CHttp::read_header(int sock, char *headerPtr)
 879 {
 880     fd_set rfds;//读文件描述符集合
 881     struct timeval tv;
 882     int bytesRead = 0, newlines = 0, ret, selectRet;
 883 
 884     int flags;
 885 
 886     flags=fcntl(sock,F_GETFL,0);//将sock套接子文件描述符设置为非阻塞方式
 887     if(flags<0)
 888     {
 889         cout << "1.fcntl() in read_header()< 0" << endl;
 890         return -1;
 891     }
 892     
 893     flags|=O_NONBLOCK;
 894     if(fcntl(sock,F_SETFL,flags)<0)
 895     {
 896         cout << "2.fcntl() < 0 in read_header()" << endl;
 897         return -1;
 898     }
 899 
 900     //挂一个while()循环来读取网页头信息
 901     while(newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
 902     {
 903         FD_ZERO(&rfds);//清理读文件描述符集合
 904         FD_SET(sock, &rfds);//将套接字文件描述符加到读文件描述符集合中
 905         tv.tv_sec = timeout;//设置最长的等待时间
 906         tv.tv_usec = 0;
 907 
 908         if(timeout >= 0)
 909             selectRet = select(sock+1, &rfds, NULL, NULL, &tv);
 910         else   //最一个参数设置为NULL,表示阻塞操作会一直等待,直到莫个监视的文件集合中的某个文件描述符符合返回条件
 911             selectRet = select(sock+1, &rfds, NULL, NULL, NULL);
 912 
 913         if(selectRet == 0 && timeout < 0) 
 914         {
 915             cout << "selectRet == 0 && timeout < 0" << endl;
 916             return -1;
 917         }
 918         else if(selectRet == -1) //select()出错
 919         {
 920             cout << "selectRet == 0 && timeout < 0 else" << endl;
 921             return -1;
 922         }
 923 
 924         ret = read(sock, headerPtr, 1);
 925         if(ret == -1)
 926         {
 927             cout << "!error: read() in read_header()" << endl;
 928             return -1;
 929         }
 930 
 931         bytesRead++;
 932                 
 933         if(*headerPtr == '\r')
 934         {                 /* Ignore CR */
 935             /* Basically do nothing special, just don't set newlines
 936              *      to 0 */
 937             headerPtr++;
 938             continue;
 939         }
 940         else if(*headerPtr == '\n')             /* LF is the separator */
 941             newlines++;
 942         else    
 943             newlines = 0;
 944                 
 945         headerPtr++;
 946 
 947     }
 948         
 949     //headerPtr -= 3;         /* Snip the trailing LF's */
 950                   /* to be compatible with Tianwang format, we have to retain them*/
 951     headerPtr -= 2;
 952     *headerPtr = '\0';
 953     //cout << "in it " << headerPtr << endl;
 954     return bytesRead;
 955 }
 956 
 957 
 958 
 959 /*
 960  function:被CreateSocket()调用,通过IO复用的方法连接目标服务器
 961  success: return 0;
 962  fail:    return -1;
 963  sockfd:  套接子文件描述符
 964  sa:      服务器套接子地址结构
 965  sec:     最长的等待时间
 966  */
 967 int CHttp::nonb_connect(int sockfd,struct sockaddr* sa,int sec)
 968 {
 969     int flags;
 970     int status;
 971     fd_set mask;//写文件描述符集合
 972     struct timeval timeout;
 973 
 974     //set the socket as nonblocking
 975     flags=fcntl(sockfd,F_GETFL,0);//将套接子文件描述符设置为非阻塞方式
 976 
 977     if(flags<0) return -1;
 978     flags|=O_NONBLOCK;//设置非阻塞方式
 979     if(fcntl(sockfd,F_SETFL,flags) < 0)
 980     {
 981         cout << "1.fcntl() in nonb_connect" << endl;
 982         return -1;
 983     }
 984 
 985     if( connect(sockfd,sa,sizeof(struct sockaddr)) == 0)//立刻连接上了
 986     {
 987         flags&=~O_NONBLOCK;//因为上面已经设置了非阻塞方式,所以我们这里有必要重新设置阻塞方式--相当于复位
 988         fcntl(sockfd,F_SETFL,flags);
 989         return sockfd;//connected immediately
 990         }
 991 
 992     FD_ZERO(&mask);//清理写文件描述符集合mask
 993     FD_SET(sockfd,&mask);//将sockfd套接字文件描述符加到文件描述符集合mask中
 994     timeout.tv_sec=sec;//设置最长的等待时间
 995     timeout.tv_usec=0;
 996     status=select(sockfd+1,NULL,&mask,NULL,&timeout);//IO复用
 997 
 998     switch(status){
 999         case -1:        // Select error, set the socket as default blocking  //select()出错
1000             flags&=~O_NONBLOCK;
1001             fcntl(sockfd,F_SETFL,flags);
1002             cout << "2.fcntl() in nonb_connect" << endl;
1003             return -1;
1004         case 0:         //Connection timed out.//连接超时
1005             flags&=~O_NONBLOCK;
1006             fcntl(sockfd,F_SETFL,flags);
1007             cout << "3.fcntl() in nonb_connect" << endl;
1008             return -1;
1009         default:         // Connected successfully.//连接成功
1010             FD_CLR(sockfd,&mask);
1011             flags&=~O_NONBLOCK;
1012             fcntl(sockfd,F_SETFL,flags);
1013             return 0;
1014     }
1015 }
1016 
1017 /*
1018 function: 检测*buf所指的内存空间剩余值是否大于more,不过再加more+1单位的内存空间
1019 success:  return 0;
1020 fail:     return -1;
1021 */
1022 int CHttp::checkBufSize(char **buf, int *bufsize, int more)
1023 {
1024     char *tmp;
1025     int roomLeft = *bufsize - (strlen(*buf) + 1);//*buf内存空间的剩余值
1026 
1027     if(roomLeft > more) return 0;//剩余值大于more返回0
1028 
1029     //pthread_mutex_lock(&mutexMemory);
1030     tmp = (char *)realloc(*buf, *bufsize + more + 1);//剩余值不够more,这个时候我们要调整内存空间的长度,长度加more+1
1031     //pthread_mutex_unlock(&mutexMemory);
1032     if(tmp == NULL) return -1;//没有调整成功返回-1
1033 
1034     *buf = tmp;
1035     *bufsize += more + 1;
1036     return 0;//调整成功
1037 }
原文地址:https://www.cnblogs.com/kakamilan/p/2578889.html