简易网页采集器的实现

--------------------------------------------------------------------------
-----------------------------Cryking 原创-----------------------------
-----------------------转载请注明出处,谢谢!------------------------ 

自己写的一个扫描网址标题的小工具.

功能:遍历指定范围的IP,根据IP扫描网页的标题,并记录(支持二级重定向网页的扫描)

         自动记录采集日志到D盘的net_collect.log文件中.

类型:控制台程序

实现语言:C#

需要的环境: .NET 3.5

可选的环境:Oracle数据库

相关的缺省值说明:

缺省Oracle数据库用户:scott (提示输入时直接按回车,即采用缺省值)

缺省Oracle数据库密码:tigger

缺省Oracle数据库连接标识符:orcl  (即TNSNAME名称)

缺省的http连接超时时间:6秒

缺省启用数据库来记录采集到的信息

缺省不启用扫描完成后自动关机

 (篇幅原因,数据库连接类这里就不贴了)

主函数代码如下:

/// <summary>
/// Console entry point: asks for the scan settings, optionally prepares the
/// Oracle table NET_COLLECT, splits the requested IP range into per-thread
/// sub-ranges, runs one scanning thread per sub-range and waits for all of them.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
        {
            try
            {
                string user = "scott";
                string pwd = "tigger";
                string tns = "orcl";

                Console.WriteLine("***************简易网址扫描器V1.0*****************");
                Console.WriteLine("**************Created  By Cryking*****************");
                Console.WriteLine("******************QQ:278676125********************");
                Console.WriteLine("**************************************************");
                Console.WriteLine("请设置超时时间(若网络环境较差,建议设大一点,如100秒)(单位/秒):");
                // Empty, non-numeric or non-positive input leaves timeOut at its
                // current value instead of aborting the program with a FormatException.
                int requestedTimeout;
                if (Int32.TryParse(Console.ReadLine(), out requestedTimeout) && requestedTimeout > 0)
                {
                    timeOut = requestedTimeout;
                }

                // Console.ReadLine() returns null on end of input; Regex.IsMatch rejects null.
                Console.WriteLine("扫描完成后是否自动关机(Y/N)?");
                if (Regex.IsMatch(Console.ReadLine() ?? string.Empty, "(?i)[y]"))
                {
                    shutDownFlag = 1;
                }
                Console.WriteLine("是否启用数据库支持(不启用则只写日志文件),Y/N?:");
                if (Regex.IsMatch(Console.ReadLine() ?? string.Empty, "(?i)[n]"))
                {
                    DBFlag = 0;
                }

                if (DBFlag == 1)
                {
                    Console.WriteLine("请输入Oracle数据库连接用户名:");
                    user = Console.ReadLine();
                    if (string.IsNullOrEmpty(user))
                    {
                        user = "scott";
                    }

                    Console.WriteLine("请输入Oracle数据库连接密码:");
                    pwd = string.Empty;
                    ConsoleKeyInfo info;
                    do
                    {
                        // Read without echo and print '*' for every accepted character.
                        info = Console.ReadKey(true);
                        if (info.Key == ConsoleKey.Backspace)
                        {
                            // Let the user correct typos: drop the last character and its mask.
                            if (pwd.Length > 0)
                            {
                                pwd = pwd.Substring(0, pwd.Length - 1);
                                Console.Write("\b \b");
                            }
                        }
                        else if (info.Key != ConsoleKey.Enter && info.Key != ConsoleKey.Escape && info.Key != ConsoleKey.Tab && info.KeyChar != '\0')
                        {
                            pwd += info.KeyChar;
                            Console.Write('*');
                        }
                    } while (info.Key != ConsoleKey.Enter);
                    if (pwd == string.Empty)
                    {
                        pwd = "tigger";
                    }
                    Console.WriteLine();

                    Console.WriteLine("请输入Oracle数据库连接标识符(TNSNAME):");
                    tns = Console.ReadLine();
                    if (string.IsNullOrEmpty(tns))
                    {
                        tns = "orcl";
                    }

                    if (!DBAccess.DBConnect(user, pwd, tns))
                    {
                        MessageBox.Show("数据库连接失败!", "错误001", MessageBoxButtons.OK, MessageBoxIcon.Error);
                        System.Diagnostics.Process.GetCurrentProcess().Kill();
                        return;
                    }
                    Console.WriteLine("数据库连接成功!");

                    // Create the result table on first use.
                    if (DBAccess.selectStr("select count(*) from user_objects where object_name='NET_COLLECT' ") == "0")
                    {
                        Console.WriteLine("开始创建表(NET_COLLECT),请等待...");
                        if (0 == DBAccess.DBExecSql(@"create table NET_COLLECT(
    IP          VARCHAR2(30) not null,
    PORT        NUMBER default 80,
    TITLE       VARCHAR2(4000),
    URL         VARCHAR2(2000),
    COLLECTDATE DATE default sysdate
)"))
                        {
                            Console.WriteLine("表(NET_COLLECT)创建成功!");
                        }
                        else
                        {
                            Console.WriteLine("表(NET_COLLECT)创建失败,请参照说明,先手工创建表(NET_COLLECT)!");
                            System.Diagnostics.Process.GetCurrentProcess().Kill();
                            return;
                        }
                    }
                }

                Console.WriteLine("请输入扫描范围(如:0.0.0.0-10.10.10.10)");
                string Scan = Console.ReadLine() ?? string.Empty;
                string[] tmpIp = Scan.Trim().Split('-');
                if (tmpIp.Length != 2)
                {
                    // Report a readable error instead of an IndexOutOfRangeException.
                    Console.WriteLine("扫描范围格式不正确,应为:起始IP-结束IP");
                    return;
                }

                // Split the range into one sub-range per worker thread.
                string[] ipScanScop = allocaIncreament(tmpIp[0], tmpIp[1]);
                logFile = new StreamWriter("d:\\net_collect.log", true);
                DateTime startTime = DateTime.Now;
                logFile.WriteLine("开始时间:" + DateTime.Now.ToString());

                // One worker thread per sub-range.
                Thread[] workers = new Thread[ipScanScop.Length];
                for (int n = 0; n < workers.Length; n++)
                {
                    workers[n] = new Thread(new ParameterizedThreadStart(ipScan));
                    workers[n].Start(ipScanScop[n]);
                }

                // Join blocks without burning a CPU core and also returns when a worker
                // ends early because of an exception; the previous spin on the shared
                // counter would never finish in that case.
                // NOTE(review): assumes ipScan does all of its work on the thread it
                // is started on - confirm.
                foreach (Thread worker in workers)
                {
                    worker.Join();
                }

                // The connection is only opened when database support is enabled.
                if (DBFlag == 1)
                {
                    DBAccess.DBClose();
                }
                TimeSpan ts = DateTime.Now - startTime;
                logFile.WriteLine("结束时间:" + DateTime.Now.ToString());
                logFile.Close();
                Console.WriteLine("总共花费时间:" + ts.ToString());
                if (1 == shutDownFlag)
                {
                    // Power the machine off once the scan is complete.
                    Process.Start("Shutdown.exe", " -s -t 0");
                }
                Console.ReadKey();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

        }


 扫描功能函数(注意:主函数中启动线程时使用的入口名为 ipScan,而下面贴出的函数名为 ipScan1):

/// <summary>
/// Thread body: scans every address of an inclusive IPv4 range given as
/// "start-end", fetches the page title of each host, prints it, appends it to
/// the shared log file and, when database support is on, stores it in NET_COLLECT.
/// </summary>
/// <param name="obj">Range text in the form "a.b.c.d-a.b.c.d"; the two ends may be given in either order.</param>
static void ipScan1(object obj)
        {
            try
            {
                string[] scope = obj.ToString().Split('-');

                // Compare the addresses numerically: a textual comparison would order
                // "9.0.0.0" after "10.0.0.0".
                long current = scanIpToNumber(scope[0]);
                long last = scanIpToNumber(scope[1]);
                if (current > last)
                {
                    long swap = current;
                    current = last;
                    last = swap;
                }

                // A range whose two ends are identical is treated as "nothing to scan",
                // as before; allocaIncreament hands such ranges to surplus threads.
                if (current == last)
                {
                    return;
                }

                string logBuffer = "";
                while (current <= last)
                {
                    int i = (int)((current >> 24) & 0xFF);
                    int j = (int)((current >> 16) & 0xFF);

                    // Skip 10.x.x.x and 127.x.x.x in one step by jumping to the next first octet.
                    if (10 == i || 127 == i)
                    {
                        current = (i + 1) * 16777216L;
                        continue;
                    }
                    // Skip 192.168.x.x by jumping to 192.169.0.0.
                    if (192 == i && 168 == j)
                    {
                        current = (i * 256L + (j + 1)) * 65536L;
                        continue;
                    }

                    string ip = scanNumberToIp(current);

                    // The default encoding is used, so some pages may come back garbled.
                    string html = GetHtmlInfo(ip, timeOut * 1000, Encoding.Default);
                    string title = GetTitle(html);
                    // Without a title, fall back to the first 1000 characters of the response.
                    title = title == string.Empty ? (html.Length > 1000 ? html.Substring(0, 1000) : html) : title;

                    if (html != string.Empty && html != "无法连接到远程服务器" && DBFlag == 1)
                    {
                        // The title comes from an arbitrary remote page: double the single
                        // quotes so it cannot terminate the SQL string literal.
                        // NOTE(review): bind variables would be safer, but the DBAccess API
                        // is not visible here.
                        string sqlTitle = (title ?? string.Empty).Replace("'", "''");
                        DBAccess.DBExecSql("insert into net_collect values('" + ip + "',default,'" + sqlTitle + "','',default)");
                    }
                    Console.WriteLine(ip + " --" + title);

                    // The writer and the counters are shared by all scanning threads, so
                    // every access to them happens inside the critical section.
                    // NOTE(review): myMutex is still taken in addition to the lock because
                    // code outside this block may synchronise on it - confirm.
                    lock (logFile)
                    {
                        myMutex.WaitOne();
                        try
                        {
                            // Consecutive identical responses are logged only once.
                            if (logBuffer != html)
                            {
                                logFile.WriteLine("ip:" + ip + " [MSG:]" + title);
                                logBuffer = html;
                            }
                            logFile.Flush();
                            countPort++;
                            count++;
                        }
                        finally
                        {
                            myMutex.ReleaseMutex();
                        }
                    }

                    current++;
                }
            }
            catch (Exception e) { Console.WriteLine(e.Message); }
            finally
            {
                // Always signal completion, atomically, so code waiting for the
                // counter to reach the thread count cannot wait forever.
                System.Threading.Interlocked.Increment(ref flag);
            }

        }

        // Converts dotted IPv4 text into its numeric value (0 .. 4294967295).
        private static long scanIpToNumber(string ip)
        {
            string[] parts = ip.Trim().Split('.');
            if (parts.Length != 4)
            {
                throw new FormatException("无效的IP地址: " + ip);
            }
            long value = 0;
            foreach (string part in parts)
            {
                int octet = Int32.Parse(part);
                if (octet < 0 || octet > 255)
                {
                    throw new FormatException("无效的IP地址: " + ip);
                }
                value = value * 256 + octet;
            }
            return value;
        }

        // Converts a numeric IPv4 value back into dotted text.
        private static string scanNumberToIp(long value)
        {
            return ((value >> 24) & 0xFF) + "." + ((value >> 16) & 0xFF) + "." + ((value >> 8) & 0xFF) + "." + (value & 0xFF);
        }


 

 网页信息获取函数

/// <summary>
/// Downloads the beginning of a web page: everything up to and including the
/// line that contains the closing title tag, or the whole body when there is none.
/// </summary>
/// <param name="url">Host or URL; "http://" is prepended when no http/https scheme is present.</param>
/// <param name="timeout">Timeout in milliseconds, applied to connecting and to reading the body.</param>
/// <param name="EnCodeType">Encoding used to decode the response body.</param>
/// <returns>
/// The text that was read; an empty string when the final status is not 200 OK
/// or a redirect carries no Location header; the exception message when the
/// request fails.
/// </returns>
static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
        {
            if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                url = "http://" + url;
            }

            StreamReader reader = null;
            HttpWebRequest request = null;
            HttpWebResponse response = null;
            try
            {
                // Redirects are not followed automatically on the first request so
                // that the redirect response itself can be inspected.
                request = createScanRequest(new Uri(url), timeout, false);
                response = (HttpWebResponse)request.GetResponse();

                if (isScanRedirect(response.StatusCode))
                {
                    string location = response.Headers["Location"];
                    Uri redirectBase = response.ResponseUri;

                    // Release the first connection before opening the second one.
                    response.Close();
                    response = null;
                    request.Abort();

                    if (string.IsNullOrEmpty(location))
                    {
                        return string.Empty;
                    }

                    // Location may be relative, so resolve it against the redirecting
                    // URL. The follow-up request keeps the same timeout and headers and
                    // is allowed to follow further redirects by itself.
                    request = createScanRequest(new Uri(redirectBase, location), timeout, true);
                    response = (HttpWebResponse)request.GetResponse();
                }

                if (response.StatusCode != System.Net.HttpStatusCode.OK)
                {
                    return string.Empty;
                }

                reader = new StreamReader(response.GetResponseStream(), EnCodeType);
                StringBuilder builder = new StringBuilder();
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    builder.Append(line);
                    // ReadLine returns whole lines, so text after the closing tag on
                    // the same line is kept. Only the new line needs checking: the tag
                    // cannot span a line break and earlier lines did not contain it.
                    if (line.IndexOf("</title>", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        break;
                    }
                    builder.Append("\r\n");
                }
                return builder.ToString();
            }
            catch (Exception ex)
            {
                // Callers receive the failure text in place of the page content.
                return ex.Message;
            }
            finally
            {
                if (reader != null) { reader.Close(); }
                if (response != null) { response.Close(); }
                if (request != null) { request.Abort(); }
            }
        }

        // Builds a request with the scanner's timeout and headers.
        private static HttpWebRequest createScanRequest(Uri target, int timeout, bool followRedirects)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(target);
            request.Timeout = timeout;
            // Timeout only covers GetResponse; ReadWriteTimeout bounds the body reads.
            request.ReadWriteTimeout = timeout;
            // The property takes the header value only, without the "User-Agent:" name.
            request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729;.NET CLR 1.0.3705)";
            request.Accept = "*/*";
            request.AllowAutoRedirect = followRedirects;
            request.KeepAlive = true;
            request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
            return request;
        }

        // True for the redirect statuses that carry a Location header to follow (301, 302, 303, 307).
        private static bool isScanRedirect(HttpStatusCode status)
        {
            return status == HttpStatusCode.MovedPermanently
                || status == HttpStatusCode.Found
                || status == HttpStatusCode.SeeOther
                || status == HttpStatusCode.TemporaryRedirect;
        }


 

 IP范围分配函数(分配给各个线程)

/// <summary>
/// Splits an inclusive IPv4 range into eight contiguous, non-overlapping
/// sub-ranges, one per scanning thread.
/// </summary>
/// <param name="tmpIp0">One end of the range in dotted form; the ends may be given in either order.</param>
/// <param name="tmpIp1">The other end of the range in dotted form.</param>
/// <returns>
/// Eight strings of the form "start-end". Sub-ranges that receive work always
/// hold at least two addresses; surplus entries are "end-end" with both sides
/// equal to the upper end of the whole range.
/// </returns>
/// <exception cref="FormatException">An address is not four octets in 0..255.</exception>
static string[] allocaIncreament(string tmpIp0, string tmpIp1)
        {
            const int workerCount = 8;

            // Work on numeric values: comparing the dotted text would order
            // "1.1.1.10" before "1.1.1.9", and 32-bit arithmetic overflows once the
            // first octets differ by 128 or more.
            long first = rangeIpToNumber(tmpIp0);
            long last = rangeIpToNumber(tmpIp1);
            if (first > last)
            {
                long swap = first;
                first = last;
                last = swap;
            }

            // The scanner shown alongside (ipScan1) treats a range with identical ends
            // as "nothing to scan", so that form marks idle threads and a busy thread
            // must never be given a single address.
            string lastText = rangeNumberToIp(last);
            string idleRange = lastText + "-" + lastText;

            long total = last - first + 1;
            int active = (int)Math.Min((long)workerCount, total / 2);
            long baseSize = active > 0 ? total / active : 0;
            long extra = active > 0 ? total % active : 0;

            string[] ipResult = new string[workerCount];
            long next = first;
            for (int n = 0; n < workerCount; n++)
            {
                if (n >= active)
                {
                    ipResult[n] = idleRange;
                    continue;
                }

                // Spread the remainder over the leading chunks so sizes differ by at most one.
                long size = baseSize + (n < extra ? 1 : 0);
                long end = next + size - 1;
                ipResult[n] = rangeNumberToIp(next) + "-" + rangeNumberToIp(end);
                next = end + 1;
            }
            return ipResult;
        }

        // Converts dotted IPv4 text into its numeric value (0 .. 4294967295).
        private static long rangeIpToNumber(string ip)
        {
            string[] parts = ip.Trim().Split('.');
            if (parts.Length != 4)
            {
                throw new FormatException("无效的IP地址: " + ip);
            }
            long value = 0;
            foreach (string part in parts)
            {
                int octet = Int32.Parse(part);
                if (octet < 0 || octet > 255)
                {
                    throw new FormatException("无效的IP地址: " + ip);
                }
                value = value * 256 + octet;
            }
            return value;
        }

        // Converts a numeric IPv4 value back into dotted text.
        private static string rangeNumberToIp(long value)
        {
            return ((value >> 24) & 0xFF) + "." + ((value >> 16) & 0xFF) + "." + ((value >> 8) & 0xFF) + "." + (value & 0xFF);
        }


 运行的界面如下:

---

工具下载地址:http://pan.baidu.com/share/link?shareid=657915&uk=2449788611

有任何问题及建议,请联系我QQ:278676125

原文地址:https://www.cnblogs.com/javawebsoa/p/3074758.html