抓取小程序

    前言:想利用小程序导航页面来提升网站的流量,于是找到了 www.xcxdh666.com 这个小程序导航网站。

 分析网页

        1. 发现网站其实是用异步分页请求来加载数据的,所以根本用不着用 XPath 解析 HTML,直接分析其请求 URL 即可。

        2. 点击“加载更多”找到对应的请求,发现其实只有 pageNum、category 两个参数。

       3. 所以直接请求该 URL 并带入参数,然后分析其返回的 JSON 结果。

  编写代码

         1. 首先建立用于接收返回数据的类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/// <summary>
/// One mini-program entry, used as the element type of <c>Result.dataList</c>.
/// </summary>
/// <remarks>
/// The members keep their lower-camel-case names; presumably they mirror the
/// field names of the remote JSON payload (confirm against a real response
/// before renaming anything).
/// </remarks>
public class XcxApplet
{
    /// <summary>Numeric identifier of the entry.</summary>
    public int id { get; set; }

    /// <summary>Category label; the importer substitutes a default when it is null or empty.</summary>
    public string categoryName { get; set; }

    /// <summary>Display name; the importer uses it as the duplicate-detection key.</summary>
    public string name { get; set; }

    /// <summary>
    /// Image path that the importer appends to the remote image base URL
    /// (presumably the scan/QR-code picture - confirm).
    /// </summary>
    public string saomaUrl { get; set; }

    /// <summary>Text the importer stores as the entry's summary.</summary>
    public string sum { get; set; }

    /// <summary>Logo image path that the importer appends to the remote image base URL.</summary>
    public string logoUrl { get; set; }
}
 
/// <summary>
/// Envelope for one page of the page-list response: the entries of the page
/// plus a few scalar fields.
/// </summary>
/// <remarks>
/// Member names are kept lower camel case; presumably they mirror the JSON
/// field names of the remote payload (confirm before renaming).
/// </remarks>
public class Result
{
    /// <summary>Entries contained in this page; may be null if the payload omits the field.</summary>
    public List<XcxApplet> dataList { get; set; }

    /// <summary>Category value carried by the response (presumably echoes the request filter - confirm).</summary>
    public string category { get; set; }

    /// <summary>Status code carried by the response; its meaning is not visible in this file.</summary>
    public int status { get; set; }

    /// <summary>Page number carried by the response.</summary>
    public int pageNum { get; set; }
}

  

     2  封装请求页面方法

  

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/// <summary>
/// Sends an HTTP POST request to <paramref name="posturl"/> and returns the
/// response body decoded as UTF-8.
/// </summary>
/// <param name="posturl">Absolute URL to request.</param>
/// <param name="postData">
/// Form-encoded request body; when null or empty the request is sent without a body.
/// </param>
/// <returns>
/// The response text, or <see cref="string.Empty"/> when the URL does not
/// produce an HTTP request, no response stream is available, or any
/// exception occurs. This method never returns null.
/// </returns>
public static string GetPostPage(this string posturl, string postData)
{
    Encoding encoding = Encoding.UTF8;
    byte[] data = string.IsNullOrEmpty(postData) ? null : encoding.GetBytes(postData);

    try
    {
        // Configure the request.
        var request = WebRequest.Create(posturl) as HttpWebRequest;
        if (request == null)
        {
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";

        if (data != null)
        {
            request.ContentLength = data.Length;
            // Dispose the request stream even if the write fails.
            using (Stream outstream = request.GetRequestStream())
            {
                outstream.Write(data, 0, data.Length);
            }
        }

        // Send the request and read the whole response body; every
        // disposable is released on all exit paths.
        using (var response = request.GetResponse() as HttpWebResponse)
        {
            if (response == null)
            {
                return string.Empty;
            }

            using (Stream instream = response.GetResponseStream())
            {
                if (instream == null)
                {
                    return string.Empty;
                }

                using (var reader = new StreamReader(instream, encoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: callers get an empty string on failure, but
        // the cause is traced instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("GetPostPage failed for {0}: {1}", posturl, ex.Message);
        return string.Empty;
    }
}

  3. 图片 URL 处理:思路就是把返回的图片 URL 请求下来,下载到本地,或者上传到自己对应的图片服务器。

           我这里是用七牛云来存储图片的,你也可以改成下载到本地,然后返回本地的 URL 就好。

  

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/// <summary>
/// Downloads the image at <paramref name="imgurl"/> and re-uploads it to the
/// Qiniu bucket, returning the URL under which the copy is served.
/// </summary>
/// <param name="imgurl">
/// Absolute HTTP(S) URL of the source image. The text after the last '/'
/// becomes the storage key.
/// </param>
/// <returns>
/// <c>http://img.siyouku.cn/{key}</c> built from the key in the upload
/// response, or an empty string when <paramref name="imgurl"/> is null/empty,
/// is not an HTTP URL, or the download/upload throws.
/// </returns>
/// <remarks>
/// Token preparation (including <c>AutoZone</c>) runs outside the try block,
/// so exceptions raised there propagate to the caller.
/// </remarks>
public string  QiniuUplod(string imgurl)
{
    if (string.IsNullOrEmpty(imgurl))
    {
        return "";
    }

    // NOTE(review): placeholders - load the real keys from configuration
    // rather than committing them to source.
    var accessKey = "你的accesskey";
    var secretKey = "你的secretkey";

    // The Mac is needed to sign the upload token.
    Mac mac = new Mac(accessKey, secretKey);
    string bucket = "siyouku";

    // Everything after the last '/'; the whole string when there is no '/'.
    string saveKey = imgurl.Substring(imgurl.LastIndexOf('/') + 1);

    // Per the original author's note: make sure AK and bucket are correct,
    // otherwise this call throws (e.g. code 612/631).
    Qiniu.Common.Config.AutoZone(accessKey, bucket, false);

    // Upload policy, see https://developer.qiniu.com/kodo/manual/put-policy
    // The scope is the bucket only. For "overwrite" uploads the scope would
    // be "BUCKET:KEY"; the original assigned that form and then immediately
    // replaced it with the bucket-only form, so only the latter is kept.
    PutPolicy putPolicy = new PutPolicy();
    putPolicy.Scope = bucket;
    // Lifetime of the policy, and therefore of the token generated from it.
    putPolicy.SetExpires(3600);

    // Upload token, see https://developer.qiniu.com/kodo/manual/upload-token
    string token = Auth.CreateUploadToken(mac, putPolicy.ToJsonString());

    try
    {
        var wReq = System.Net.WebRequest.Create(imgurl) as System.Net.HttpWebRequest;
        if (wReq == null)
        {
            return "";
        }

        using (var resp = wReq.GetResponse() as System.Net.HttpWebResponse)
        {
            if (resp == null)
            {
                return "";
            }

            using (var stream = resp.GetResponseStream())
            {
                // Per the original author's note: the response stream is not
                // seekable (no Stream.Length), so use FormUploader (or
                // ResumableUploader) rather than UploadManager.UploadStream.
                FormUploader fu = new FormUploader();
                var result = fu.UploadStream(stream, saveKey, token);
                var uploaded = Newtonsoft.Json.JsonConvert.DeserializeObject<QiniuResult>(result.Text);
                if (uploaded == null)
                {
                    return "";
                }

                // NOTE(review): a rejected upload may deserialize without a key;
                // consider checking the upload result before building the URL.
                return $"http://img.siyouku.cn/{uploaded.key}";
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: the caller gets "" on failure, but the
        // cause is traced instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("QiniuUplod failed for {0}: {1}", imgurl, ex.Message);
        return "";
    }
}

  

   4 最后是请求主体方法 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/// <summary>
/// Crawls the remote page-list endpoint page by page, stores every entry
/// whose name is not yet known in <c>Db.Applet</c>, and reports the elapsed
/// time.
/// </summary>
/// <returns>
/// A content result with the completion message followed by the elapsed
/// milliseconds.
/// </returns>
/// <remarks>
/// Pages that cannot be fetched or that carry no <c>dataList</c> are skipped.
/// Changes are saved once per page.
/// </remarks>
public ActionResult GetxcxList()
{
    // Number of pages requested (pageNum 0..53), as hard-coded by the original.
    const int pageCount = 54;

    // Measures how long the whole crawl takes.
    Stopwatch watch = Stopwatch.StartNew();

    // Names already handled in this run. The database check below only sees
    // saved rows, so entries added earlier in the same unsaved page would
    // otherwise be inserted twice.
    var seenNames = new HashSet<string>();

    for (int j = 0; j < pageCount; j++)
    {
        string url =
            $"https://www.xcxdh666.com/pageList.htm?pageNum={j}";

        // Skip the page on a missing or empty response body instead of
        // trying to parse it.
        var str = url.GetPostPage(null);
        if (string.IsNullOrEmpty(str))
        {
            continue;
        }

        // Parsed per page, so a failed page can never re-process the
        // previous page's entries.
        var result = str.JsonConvert<Result>();
        if (result == null || result.dataList == null)
        {
            continue;
        }

        foreach (var item in result.dataList)
        {
            string name = item.name;

            // Skip names already handled in this run or already stored.
            if (!seenNames.Add(name) || Db.Applet.Any(a => a.Name == name))
            {
                continue;
            }

            var applet = new Applet()
            {
                CategoryName = string.IsNullOrEmpty(item.categoryName) ? "其它" : item.categoryName,
                Name = name,
                // Both images are copied to our own storage; the stored
                // value is whatever QiniuUplod returns.
                SaomiaoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.saomaUrl}"),
                Summary = item.sum,
                LogoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.logoUrl}"),
                SortNum = j,
                CreateUser = "wenqing",
                CreateTime = DateTime.Now
            };
            Db.Applet.Add(applet);
        }

        Db.SaveChanges();
    }

    watch.Stop();
    return Content("爬取完成!本次请求总共耗时:"+ watch.ElapsedMilliseconds);
}
    }

  

ok  到这里就全部抓取完成

         这里附上 展示地址  http://siyouku.cn/Applet

原文地址:https://www.cnblogs.com/zzp0320/p/7878701.html