C# 抓取网站数据

// Download the listing page and copy the rows of its first <table> into a DataTable
// with the columns ServerName, GoodsName, Price, Qty and Id (all stored as strings).
string url = "http://www.123.com/fast_view?a=1&gameId=25&areaId=0&serverId=0";
string reffer = "http://www.123.com/";

// GetHTML reports failures as text beginning with "Error:"; such text normally contains
// no <table>, so dt is then simply left empty.
string html = GetHTML(url, reffer, 10 * 1000);

// Locate the first table. The closing tag is searched for only after the opening tag,
// and tag case is ignored to stay consistent with the IgnoreCase regexes below.
const string tableCloseTag = "</table>";
int tableStart = html.IndexOf("<table", System.StringComparison.OrdinalIgnoreCase);
int tableEnd = tableStart < 0
    ? -1
    : html.IndexOf(tableCloseTag, tableStart, System.StringComparison.OrdinalIgnoreCase);

DataTable dt = new DataTable();
dt.Columns.Add("ServerName", typeof(System.String));
dt.Columns.Add("GoodsName", typeof(System.String));
dt.Columns.Add("Price", typeof(System.String));
dt.Columns.Add("Qty", typeof(System.String));
dt.Columns.Add("Id", typeof(System.String));

if (tableStart != -1 && tableEnd != -1)
{
    string tableHtml = html.Substring(tableStart, tableEnd - tableStart + tableCloseTag.Length);

    System.Text.RegularExpressions.RegexOptions options =
        System.Text.RegularExpressions.RegexOptions.Singleline |
        System.Text.RegularExpressions.RegexOptions.IgnoreCase;

    // Build each pattern once instead of re-parsing it for every row.
    System.Text.RegularExpressions.Regex rowPattern =
        new System.Text.RegularExpressions.Regex("<tr[^>]*>(.*?)</tr>", options);
    System.Text.RegularExpressions.Regex cellPattern =
        new System.Text.RegularExpressions.Regex("<td[^>]*>(.*?)</td>", options);

    // Literal parentheses must be escaped; unescaped they open regex groups, which made
    // the SelfTextCut2 pattern invalid and the other patterns match the wrong text.
    // NOTE(review): the patterns assume the cells embed calls such as SelfTextCut2('...'),
    // parseFloat(...) and dl('123') - confirm against the live page.
    System.Text.RegularExpressions.Regex quotedTextPattern =
        new System.Text.RegularExpressions.Regex(@"SelfTextCut2\('([^']*)'", options);
    System.Text.RegularExpressions.Regex pricePattern =
        new System.Text.RegularExpressions.Regex(@"parseFloat\(([^)]*)\)", options);
    System.Text.RegularExpressions.Regex idPattern =
        new System.Text.RegularExpressions.Regex(@"dl\('(\d+)'\)", options);
    System.Text.RegularExpressions.Regex tagPattern =
        new System.Text.RegularExpressions.Regex("<[^>]*>", options);

    // Only rows inside the located table are scanned, not the whole document.
    foreach (System.Text.RegularExpressions.Match tr in rowPattern.Matches(tableHtml))
    {
        System.Text.RegularExpressions.MatchCollection tds = cellPattern.Matches(tr.Value);

        // Rows with fewer than 8 cells (for example header rows) are not data rows.
        if (tds.Count < 8)
        {
            continue;
        }

        DataRow dr = dt.NewRow();
        // A failed match yields an empty Groups[1].Value, so the column becomes "".
        dr["ServerName"] = quotedTextPattern.Match(tds[0].Value).Groups[1].Value;
        dr["GoodsName"] = quotedTextPattern.Match(tds[2].Value).Groups[1].Value;
        dr["Price"] = pricePattern.Match(tds[5].Value).Groups[1].Value;
        // Qty is the cell with all markup stripped.
        dr["Qty"] = tagPattern.Replace(tds[6].Value, "");
        dr["Id"] = idPattern.Match(tds[7].Value).Groups[1].Value;
        dt.Rows.Add(dr);
    }
}

================================================================================================================================

附上一个小小的 GetHTML 方法,嘎嘎

/// <summary>
/// Downloads a page with an HTTP GET request and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="strUrl">URL to request.</param>
/// <param name="Reffer">Value sent in the Referer request header.</param>
/// <param name="Timeout">Request timeout in milliseconds.</param>
/// <returns>
/// The response body on success; otherwise the text "Error:" followed by the exception
/// details. This method does not throw for request or read failures.
/// </returns>
public static string GetHTML(string strUrl, string Reffer, int Timeout)
{
    try
    {
        // Build the request.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

        request.Method = "GET";
        request.ServicePoint.Expect100Continue = false;

        // Request headers.
        request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        // Let the framework send Accept-Encoding and undo both gzip and deflate itself.
        // Advertising deflate while only decoding gzip by hand would corrupt deflate bodies.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.KeepAlive = false;

        request.Referer = Reffer;
        request.Timeout = Timeout;

        // Dispose the response, stream and reader even when reading fails, so the
        // underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception err)
    {
        // Callers receive failures as text rather than as an exception.
        return "Error:" + err.ToString();
    }
}
原文地址:https://www.cnblogs.com/ghelement/p/4512012.html