.NET 版类似“火车头”的网页采集器

最近因工作需要,需要写一个类似“火车头”的 Web 采集器。

各位有什么建议啊?

由于我对正则表达式还不熟悉,只能先简单地写一段测试代码,代码如下:

代码
/// <summary>
/// Crawls the list pages index_{exp1.Text} .. index_{exp2.Text} on souky.eol.cn.
/// For every link on a list page whose href starts with "/HomePage/takeinfo/{page}",
/// the detail page is downloaded, its content fragment is cut out, stripped with
/// NoHTML and inserted into table "temp" (column subContent).
/// Any unexpected failure stops the crawl and shows the exception message in tb.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // Captures each anchor's href ("url") and inner text ("text"); a list page contains many links.
    Regex linkPattern = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

    try
    {
        // Parse the page range once instead of re-parsing exp2.Text on every iteration.
        int firstPage = Convert.ToInt32(exp1.Text);
        int lastPage = Convert.ToInt32(exp2.Text);

        using (WebClient client = new WebClient())
        {
            for (int i = firstPage; i <= lastPage; i++)
            {
                string listUrl = "http://souky.eol.cn/HomePage/index_" + i + ".html";

                // Skip list pages that do not exist or cannot be reached.
                if (!test(listUrl))
                {
                    continue;
                }

                string listHtml = client.DownloadString(listUrl);
                string detailPrefix = "/HomePage/takeinfo/" + i;

                foreach (Match m in linkPattern.Matches(listHtml))
                {
                    string href = m.Groups["url"].Value;
                    if (!href.StartsWith(detailPrefix))
                    {
                        continue;
                    }

                    tb.Text += href + "\n";
                    string detailHtml = client.DownloadString("http://souky.eol.cn" + href);

                    // A page with an unexpected layout is skipped instead of aborting the whole crawl.
                    string fragment = ExtractContentFragment(detailHtml);
                    if (fragment == null)
                    {
                        continue;
                    }

                    // NOTE(review): the statement is built by concatenation. NoHTML removes quote
                    // characters and HTML-encodes its result before it is embedded here; switch to a
                    // parameterized command if ULCode.XSql.MsSql offers one - TODO confirm.
                    string sql = "insert into temp (subContent) values('" + NoHTML(fragment) + "')";
                    try
                    {
                        ULCode.XSql.MsSql.Execute(sql);
                    }
                    catch (Exception)
                    {
                        // Best effort: a row that fails to insert is skipped and the crawl goes on.
                    }
                    finally
                    {
                        // Status text; this replaces the URL log accumulated in tb so far.
                        tb.Text = "输出";
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        tb.Text = ex.Message;
    }
}

/// <summary>
/// Cuts the raw content fragment out of a detail page. The table-cell layout is tried
/// first, then the div layout. The end marker is searched only after the start marker.
/// </summary>
/// <param name="page">Full HTML of the detail page.</param>
/// <returns>The fragment from the start marker up to (not including) the end marker,
/// or null when either marker is missing.</returns>
private static string ExtractContentFragment(string page)
{
    string endMarker = "<td height=50>";
    int start = page.IndexOf("<td class=\"font14\" style=\"word-wrap:break-word;\">", StringComparison.Ordinal);
    if (start == -1)
    {
        start = page.IndexOf("<div class=\"line_24 pad_c\">", StringComparison.Ordinal);
        endMarker = "</div>";
    }

    if (start == -1)
    {
        return null;
    }

    int end = page.IndexOf(endMarker, start, StringComparison.Ordinal);
    if (end == -1)
    {
        return null;
    }

    return page.Substring(start, end - start);
}
/// <summary>
/// Passes the statement to ULCode.XSql.MsSql.Execute and returns its integer result.
/// </summary>
/// <param name="sql">SQL text to execute.</param>
/// <returns>The value returned by ULCode.XSql.MsSql.Execute.</returns>
private int exe(string sql)
{
    return ULCode.XSql.MsSql.Execute(sql);
}
/// <summary>
/// Checks whether a URL can be fetched: issues a request and reports whether a
/// response other than 404 Not Found came back.
/// </summary>
/// <param name="url">Absolute HTTP URL to probe.</param>
/// <returns>true when a response was received and its status is not NotFound;
/// false when the status is NotFound or the request failed.</returns>
private bool test(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    try
    {
        // using guarantees the response (and its connection) is released on every path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            return response.StatusCode != HttpStatusCode.NotFound;
        }
    }
    catch (WebException ex)
    {
        // GetResponse reports HTTP error statuses (404 included) by throwing; the response
        // attached to the exception must be closed as well or the connection stays occupied.
        if (ex.Response != null)
        {
            ex.Response.Close();
        }
        return false;
    }
    catch (Exception)
    {
        // Any other failure also counts as "not available".
        return false;
    }
}
/// <summary>
/// Strips HTML from a string: removes script blocks, quote characters, tags and comment
/// remnants, decodes a handful of named/numeric entities, drops remaining numeric
/// entities and angle brackets, then HTML-encodes and trims the result.
/// </summary>
/// <param name="Htmlstring">HTML text to clean; null or empty yields an empty string.</param>
/// <returns>The cleaned, HTML-encoded, trimmed text.</returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove script blocks. Singleline lets '.' span line breaks so multi-line scripts go too.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // NOTE(review): this only matches a p/br tag that is immediately followed by ';' and
    // turns it into a [--tag--] marker; kept unchanged - confirm whether it is still wanted.
    Htmlstring = Regex.Replace(Htmlstring, @"<(/?p|br[^>]*)>;", "[--$1--]", RegexOptions.IgnoreCase);

    // Remove double and single quote characters.
    Htmlstring = Htmlstring.Replace("\"", "");
    Htmlstring = Htmlstring.Replace("'", "");

    // Remove every remaining tag.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove comment remnants: closing markers, and an opening marker plus the rest of its line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the entities this function knows about.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

    // Drop any other numeric entity.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Strings are immutable: the result of Replace has to be assigned to take effect.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");

    // The static HttpUtility encoder needs no current HTTP request, unlike HttpContext.Current.Server.
    return HttpUtility.HtmlEncode(Htmlstring).Trim();
}
原文地址:https://www.cnblogs.com/OK_Blog/p/1822426.html