On downloading data from web pages

  1 #include "curl/curl.h"  
  2 
  3 #include <io.h> 
  4 #include <direct.h>  
  5 #include <stdio.h>
  6 #include <string>
  7 #include <fstream>
  8 #include <stdlib.h> 
  9 
 10 #pragma comment(lib, "libcurl.lib")   
 11 #pragma comment(lib, "wldap32.lib")   
 12 #pragma comment(lib, "ws2_32.lib")   
 13 #pragma comment(lib, "winmm.lib")   
 14 
 15 size_t write_func(void *ptr, size_t size, size_t nmemb, FILE *stream)
 16 {
 17     return fwrite(ptr, size, nmemb, stream);
 18 }

// globals filled in by the link parser and shared across the helpers
std::string analyze_name;
std::string analyze_url;
std::string cplocation;
std::string httpurl;

// Extract the short URL and display name from one <a href="url">name</a> line.
void get_name_url(std::string line)
{
    const char *linkhead = "<a href=";
    const char linkmiddle = '"';
    const char linktail = '<';           // first character of "</a>"
    std::string head = "<a href=";

    const char *s = strstr(line.c_str(), linkhead);
    if (s == NULL)                       // no anchor on this line
        return;
    s += 9;                              // skip past <a href=" (8 chars + quote)
    int urloffset = 0;
    while (*s != linkmiddle)             // scan to the closing quote of the URL
    {
        s++;
        urloffset++;
    }
    analyze_url = line.substr(line.find(head) + 9, urloffset);
    s += 2;                              // skip the "> that closes the tag
    int nameoffset = 0;
    while (*s != linktail)               // scan to the < of the closing </a>
    {
        s++;
        nameoffset++;
    }
    analyze_name = line.substr(line.find(head) + 11 + urloffset, nameoffset);
}
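
// A hedged alternative (my sketch, not part of the original post): the same
// extraction using std::string::find instead of raw pointer walking. Assumes
// each line holds one anchor of the form <a href="url">name</a>.
void get_name_url_find(const std::string &line)
{
    const std::string head = "<a href=\"";
    size_t url_begin = line.find(head);
    if (url_begin == std::string::npos)
        return;
    url_begin += head.length();
    size_t url_end = line.find('"', url_begin);       // closing quote of the URL
    if (url_end == std::string::npos)
        return;
    size_t name_begin = line.find('>', url_end) + 1;  // just past the "> of the tag
    size_t name_end = line.find("</a>", name_begin);  // start of the closing tag
    if (name_end == std::string::npos)
        return;
    analyze_url = line.substr(url_begin, url_end - url_begin);
    analyze_name = line.substr(name_begin, name_end - name_begin);
}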

// Download the page at the given HTTP URL to a local path.
int http_get(std::string httpurl, std::string location)
{
    CURL *curl;
    CURLcode res;
    FILE *outfile;
    errno_t err;
    err = fopen_s(&outfile, location.c_str(), "wb");
    if (err != 0)                                    // could not open the output file
        return -1;
    const char *url = httpurl.c_str();
    curl = curl_easy_init();
    if (curl)
    {
        curl_easy_setopt(curl, CURLOPT_URL, url);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);    // handed to write_func as 'stream'
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_func);
        res = curl_easy_perform(curl);
        curl_easy_cleanup(curl);
    }
    fclose(outfile);
    return 0;
}
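
// Hedged variant (my addition, not in the original post): http_get discards
// the CURLcode, so failures are silent. This sketch reports errors, follows
// redirects, and checks the HTTP status; CURLOPT_FOLLOWLOCATION,
// CURLINFO_RESPONSE_CODE and curl_easy_strerror are standard libcurl APIs.
int http_get_checked(const std::string &httpurl, const std::string &location)
{
    FILE *outfile = NULL;
    if (fopen_s(&outfile, location.c_str(), "wb") != 0 || outfile == NULL)
        return -1;
    CURL *curl = curl_easy_init();
    if (curl == NULL)
    {
        fclose(outfile);
        return -1;
    }
    curl_easy_setopt(curl, CURLOPT_URL, httpurl.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_func);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);   // follow 3xx redirects
    CURLcode res = curl_easy_perform(curl);
    long status = 0;
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
    if (res != CURLE_OK)
        fprintf(stderr, "curl error: %s\n", curl_easy_strerror(res));
    else if (status >= 400)                               // e.g. 403/404 error pages
        fprintf(stderr, "HTTP status %ld for %s\n", status, httpurl.c_str());
    curl_easy_cleanup(curl);
    fclose(outfile);
    return (res == CURLE_OK && status < 400) ? 0 : -1;
}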


// Scan a saved index page line by line and download every matching href link.
int linkextract(std::string httpfile, std::string httpurl, std::string cplocation)
{
    std::ifstream infile;
    std::string line, link, name;
    infile.open(httpfile, std::ios::in);
    std::string directory = "alt=\"[DIR]\"";
    std::string file = "alt=\"[   ]\"";

    while (std::getline(infile, line))
    {
        //if (line.find(directory) != std::string::npos)  // directory entry
        //{
        //    get_name_url(line);
        //    analyze_name = analyze_name.substr(0, analyze_name.length() - 1);
        //    std::string content = cplocation + "/" + analyze_name;
        //    _mkdir(content.c_str());
        //    httpurl = httpurl + analyze_url;
        //    cplocation = cplocation + "/" + analyze_name;
        //    http_get(httpurl, cplocation + "/" + analyze_name + ".txt");
        //    linkextract(cplocation + "/" + analyze_name + ".txt", httpurl, cplocation);
        //}
        //else if (line.find(file) != std::string::npos)  // file entry
        {
            //get_name_url(line);
            if (line.find("href=\"Tile") == std::string::npos)
                continue;

            const char *linkhead = "<a href=";
            const char linkmiddle = '"';
            const char linktail = '<';              // first character of "</a>"
            std::string head = "<a href=";

            const char *s = strstr(line.c_str(), linkhead);
            if (s == NULL)                          // no anchor on this line
                continue;
            s += 9;                                 // skip past <a href=" (8 chars + quote)
            int urloffset = 0;
            while (*s != linkmiddle)                // scan to the closing quote of the URL
            {
                s++;
                urloffset++;
            }
            analyze_url = line.substr(line.find(head) + 9, urloffset);
            s += 2;                                 // skip the "> that closes the tag
            int nameoffset = 0;
            while (*s != linktail)                  // scan to the < of the closing </a>
            {
                s++;
                nameoffset++;
            }
            analyze_name = line.substr(line.find(head) + 11 + urloffset, nameoffset);

            http_get(httpurl + analyze_url, cplocation + "/" + analyze_name);
        }
        /*else
            continue;*/
    }
    infile.close();

    return 0;
}


int main()
{
    curl_global_init(CURL_GLOBAL_DEFAULT);           // initialize libcurl once per program
    std::string httpurl = "http://server-address";   // placeholder host from the original post
    std::string content = "../DownloadDirectory";
    _mkdir(content.c_str());
    std::string localdir = "../DownloadDirectory/DownloadDirectory.txt";
    http_get(httpurl, localdir);                     // fetch the index page to a local file
    linkextract(localdir, httpurl, content);         // then download every matching link
    curl_global_cleanup();
    return 0;
}

This only downloads files that appear as href links, and many pages cannot be downloaded at all;
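My guess, not something diagnosed in the original post: some of those failures come from servers that reject requests with no User-Agent header, from redirects that are never followed, or from HTTPS URLs with no CA bundle configured. A hypothetical helper applying those options could be called right after curl_easy_init in http_get; all four are standard libcurl options, but the user-agent string and bundle filename are assumptions:

    // hypothetical fixes for pages that refuse to download; call after curl_easy_init
    void apply_common_options(CURL *curl)
    {
        curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0");      // some servers reject an empty UA
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);            // follow 3xx redirects
        curl_easy_setopt(curl, CURLOPT_CAINFO, "curl-ca-bundle.crt");  // assumed CA bundle path for HTTPS
        curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 10L);           // fail fast on unreachable hosts
    }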

Note that you need to download and build curl, then configure the include and library paths in the project; there are good blog posts on downloading and compiling curl, but I can't find them at the moment;
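One related detail, an assumption about a common MSVC setup rather than something from the original post: the #pragma comment(lib, "libcurl.lib") line above matches a DLL import library, while a statically linked libcurl additionally needs CURL_STATICLIB defined before the header is included, or the linker reports unresolved __imp__ symbols:

    // only when linking a static libcurl build (assumed setup)
    #define CURL_STATICLIB
    #include "curl/curl.h"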

Original article: https://www.cnblogs.com/ningmouming/p/12100447.html