Linux下socket实现网页抓取 Unicorn 博客频道 CSDN.NET

Linux下socket实现网页抓取 - Unicorn - 博客频道 - CSDN.NET


Linux下socket实现网页抓取


分类:
C/C++学习点滴
DO spiders DO
linux编程


951人阅读
评论(0)
收藏
举报

 主要用来和WinSock进行下比较:

--WinSock--

需要初始化:

/* WinSock has to be initialised before it is used; a non-zero return value
 * from WSAStartup is the error code that gets reported here. */
if ((Ret = WSAStartup(MAKEWORD(1, 1), &wsaData)) != 0)
{
    /* "\n" is the newline escape; the "/n" in the extracted text was garbled. */
    printf("WSAStartup failed with error %d\n", Ret);
    return FALSE;
}

头文件:

--WinSock--

#include <winsock2.h> //header

#pragma comment (lib, "ws2_32.lib") //lib

--Linux--

#include <sys/socket.h>

#include <netinet/in.h>

#include <arpa/inet.h>

#include <netdb.h>

各个头文件的作用还需要进一步研究

gethostbyname(host)://从主机名返回地址

这个都是一样的,返回一个struct hostent *的指针。

地址结构:

--WinSock--

SOCKADDR_IN

--Linux--

sockaddr_in

实际上是一样的都是

struct sockaddr_in{

   shortsin_family;

   unsigned short sin_port;

   struct in_addr sin_addr;

   charsin_zero[8];

};

(

  这个结构是sockaddr的等价结构

  struct sockaddr

  {

   unsigned short sa_family; // address family, AF_XXX

 char sa_data[14];   //14 bytes of protocol address

  };

)

其中IP地址结构struct in_addr定义如下:

/*
 * IP address container. The three union members overlay the same storage,
 * so the address can be read as four bytes, two 16-bit words, or one
 * integer.
 * NOTE(review): this union layout looks like the WinSock declaration; Linux
 * headers are believed to expose a single s_addr member instead -- confirm.
 */
struct in_addr {
    union {
        /* byte-wise view */
        struct {
            unsigned char s_b1;
            unsigned char s_b2;
            unsigned char s_b3;
            unsigned char s_b4;
        } S_un_b;

        /* 16-bit word view */
        struct {
            unsigned short s_w1;
            unsigned short s_w2;
        } S_un_w;

        /* whole address as one integer */
        unsigned long S_addr;
    } S_un;
};

Socket:

--WinSock--

返回句柄SOCKET,就是socket描述符

--Linux--

比较直接,返回int型的socket描述符

函数接口都一样

函数例子:

socket (AF_INET, SOCK_STREAM, 0); //TCP

connect(sock, (const sockaddr * )&tcpaddr, sizeof(tcpaddr)); //返回值有不同

--WinSock--

If no error occurs, connect returns zero. Otherwise, it returns SOCKET_ERROR, and a specific error code can be retrieved by calling

WSAGetLastError.

--Linux--

成功返回0;出错返回-1,具体错误码保存在errno中

send(sock_description, message, strlen(message), 0); //返回值不同

--WinSock--

If no error occurs, send returns the total number of bytes sent, which can be less than the number indicated by len. Otherwise, a value of

SOCKET_ERROR is returned, and a specific error code can be retrieved by calling WSAGetLastError.

--Linux--

成功返回实际发送的字节数(可能小于请求发送的长度);出错返回-1,具体错误码保存在errno中

recv(sock_description, buffer, sizeof(buffer), 0);//返回值不同

--WinSock--

If no error occurs, recv returns the number of bytes received. If the connection has been gracefully closed, the return value is zero. Otherwise, a

value of SOCKET_ERROR is returned, and a specific error code can be retrieved by calling WSAGetLastError.

--Linux--

成功返回实际接收到的字节数;对端正常关闭连接时返回0;出错返回-1,具体错误码保存在errno中

结束:

--WinSock--

/* WinSock teardown: close the socket, then release the library. */
closesocket(sock);

if (WSACleanup() == SOCKET_ERROR)
{
    /* "\n" is the newline escape; the "/n" in the extracted text was garbled. */
    printf("WSACleanup failed with error %d \n", WSAGetLastError());
}

--Linux--

close(sock);

下面是一个在Linux下使用socket实现HTTP协议GET方法的示例程序:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>

/* Name of the server to contact. */
char *host = "www.hao123.com";

/* Port number used for the connection. */
int port = 80;

/*
 * Minimal HTTP/1.1 client.
 *
 * Resolves the global `host`, connects to it on the global `port`, sends a
 * GET request for "/" and copies the response to stdout: first the header
 * section one byte at a time (up to and including the empty line that ends
 * it), then the body in buffer-sized chunks until the peer closes the
 * connection.
 *
 * Returns 0 on success and 1 on failure. Diagnostics are written to stderr
 * so that they do not mix with the downloaded page on stdout.
 */
int main(void)
{
    char buffer[512];
    char message[512];
    struct sockaddr_in pin;
    struct hostent *remoteHost;
    ssize_t n = 0;
    size_t sent;
    size_t total;
    int len;
    int isock;
    int chars = 0;   /* non-CR/LF characters seen on the current header line */
    int done = 0;    /* set once the empty line ending the headers is read */
    int status = 1;

    remoteHost = gethostbyname(host);
    if (remoteHost == NULL || remoteHost->h_addrtype != AF_INET ||
        remoteHost->h_addr_list[0] == NULL) {
        fprintf(stderr, "Error resolving host\n");
        return status;
    }

    memset(&pin, 0, sizeof pin);
    pin.sin_family = AF_INET;
    pin.sin_port = htons((unsigned short)port);
    /* Copy the raw address bytes instead of casting the char pointer to a
     * struct pointer; h_addr_list[0] is used because the h_addr shorthand is
     * not available in every compilation mode. */
    memcpy(&pin.sin_addr, remoteHost->h_addr_list[0], sizeof pin.sin_addr);

    isock = socket(AF_INET, SOCK_STREAM, 0);
    if (isock == -1) {
        fprintf(stderr, "Error opening socket!\n");
        return status;
    }

    /* "Connection: close" asks the server to close the socket after the
     * response, which is what ends the body loop below. A single empty line
     * terminates the header section. */
    len = snprintf(message, sizeof message,
                   "GET / HTTP/1.1\r\n"
                   "Host: %s\r\n"
                   "Accept: */*\r\n"
                   "User-Agent: Mozilla/4.0(compatible)\r\n"
                   "Connection: close\r\n"
                   "\r\n",
                   host);
    if (len < 0 || (size_t)len >= sizeof message) {
        fprintf(stderr, "Error building request\n");
        goto out;
    }
    fputs(message, stdout);

    if (connect(isock, (struct sockaddr *)&pin, sizeof pin) == -1) {
        fprintf(stderr, "Error connecting to socket\n");
        goto out;
    }

    /* send() may accept fewer bytes than requested, so keep going until the
     * whole request has been handed to the kernel. */
    total = (size_t)len;
    for (sent = 0; sent < total; sent += (size_t)n) {
        n = send(isock, message + sent, total - sent, 0);
        if (n <= 0) {
            fprintf(stderr, "Error in send\n");
            goto out;
        }
    }

    /* Header section: echo byte by byte until an empty line is seen. */
    while (!done) {
        n = recv(isock, buffer, 1, 0);
        if (n <= 0) {
            /* error, or the peer closed the connection early */
            break;
        }

        switch (buffer[0]) {
        case '\r':
            break;
        case '\n':
            if (chars == 0) {
                done = 1;
            }
            chars = 0;
            break;
        default:
            chars++;
            break;
        }

        putchar(buffer[0]);
    }

    /* Body: read until the peer closes. fwrite() with the received length is
     * used so that data containing NUL bytes is not truncated. */
    if (done) {
        while ((n = recv(isock, buffer, sizeof buffer, 0)) > 0) {
            fwrite(buffer, 1, (size_t)n, stdout);
        }
    }

    if (n < 0) {
        fprintf(stderr, "Error in recv\n");
        goto out;
    }

    status = 0;

out:
    close(isock);
    return status;
}

原文地址:https://www.cnblogs.com/lexus/p/2616303.html