用C++爬取网页

          做了好几天,终于写出来了,以前没有想到过,用C++也可以爬取网页,经过这么多天的努力终于做好了,解决了乱码问题。

从中学到很多,小到一个函数的参数,大到如何使用一个函数。

           还有C++中一直让人头疼的编码问题,尤其是Unicode编码问题,研究了很多资料,又对MultiByteToWideChar和WideCharToMultiByte有了重新的认识。

一个重要的关键点是Windows默认使用ANSI字符集,同时还对HTML的格式进行了分析,以判断网页的编码。

           感觉那么多天的辛苦没有白费,付出有了收获。不过在此,真的感谢那些牛人,期间也参考了他们的代码。

 代码:

#include <winsock2.h>

#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#pragma comment(lib,"ws2_32.lib")

using namespace std;

void getWebPage(char *url)
{
	SOCKET sock;
	WSADATA wsa;
	struct sockaddr_in  addrclient;
	ofstream of;
	WSAStartup(MAKEWORD(2,2),&wsa);
	of.open("temp.txt");
	if(!of)
	{
		cout<<"open fail!"<<endl;
		return;
	}
	static char content[100000]="";
	char myurl[256];
	char host[256];
	char dom[256];
	char header[256];
	char type[512];
	char *p;
	memset(myurl,'\0',256);
	memset(host,'\0',256);
	memset(dom,'\0',256);
	memset(header,'\0',256);
	memset(type,'\0',512);
	char *purl=0;
	struct hostent *phost;
	sock=socket(PF_INET,SOCK_STREAM,IPPROTO_TCP);

	strcpy(myurl,url);
	for(purl=myurl;*purl!='/'&&purl!='\0';++purl);
	if(int(purl-myurl)==strlen(myurl))
		strcpy(host,"/");
	else
		strcpy(host,purl);
	*purl='\0';
	strcpy(dom,myurl);

	cout<<dom<<endl;          //输出域名
	cout<<host<<endl;     //输出地址
	of<<dom<<endl;
	of<<host<<endl;
	phost=gethostbyname(dom);
		
	addrclient.sin_family=AF_INET;
	addrclient.sin_port=htons(80);
	addrclient.sin_addr.S_un.S_addr=*((unsigned long *)phost->h_addr);
	
	connect(sock,(struct sockaddr*)&addrclient,sizeof(addrclient));
	
	strcat(header, "GET "); 
    strcat(header, host); 
    strcat(header, " HTTP/1.1\r\n"); 
    strcat(header, "Host: "); 
    strcat(header, dom); 
    strcat(header, "\r\nConnection: Close\r\n\r\n"); 
	send(sock,header,strlen(header),0);
	recv(sock,type,512,0);
	cout<<type<<endl;
	of<<type;
	p=strstr(type,"utf-8");
	if(p)
	{
	memset(content,'\0',100000);
	while(recv(sock,content,100000,0)>0)
	{
		int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
        unsigned short * wszGBK = new unsigned short[len+1];
        memset(wszGBK, 0, len * 2 + 2);
        MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
		len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL);  
        char *szGBK=new char[len + 1];
        memset(szGBK, 0, len + 1);
        WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
		cout<<szGBK;
		of<<szGBK;
		strnset(content,'\0',100000);
		delete []wszGBK;
		delete [] szGBK;
	}
	}
	else
	{
		memset(type,'\0',512);
		recv(sock,type,512,0);
		cout<<type;
		of<<type;
		p=strstr(type,"gb2312");
		if(p)
		{
			while(recv(sock,content,100000,0))
			{
				cout<<content;
				of<<content;
				strnset(content,'\0',100000);
			}
		}
		else
		{
           while(recv(sock,content,100000,0)>0)
	       {
		       int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
               unsigned short * wszGBK = new unsigned short[len+1];
               memset(wszGBK, 0, len * 2 + 2);
               MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
		       len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL);  
               char *szGBK=new char[len + 1];
               memset(szGBK, 0, len + 1);
               WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
		       cout<<szGBK;
		       of<<szGBK;
		       strnset(content,'\0',100000);
		       delete []wszGBK;
		       delete [] szGBK;
	       }
		}
	}
	closesocket(sock); 
    WSACleanup();
	of.close();
	cout<<endl;
}
// Prompts with "http://", reads the remainder of the address (domain plus
// optional path) from standard input and fetches that page.
// Returns 1 when no address could be read, 0 otherwise.
int main()
{
	char url[256]="";
	cout<<"http://";
	// Limit extraction to the buffer size (255 characters plus the
	// terminator); without a width, >> into a char array is unbounded.
	cin.width(sizeof(url));
	if(!(cin>>url))
		return 1;
	getWebPage(url);
	return 0;
}

 对此,又对socket编程产生了兴趣,socket编程魅力无穷。

原文地址:https://www.cnblogs.com/xshang/p/3097589.html