关于多线程的一点东西。

毕业设计准备做搜索方面的课题,所以开始写爬虫程序。想法是这样:从一个网站开始,抓取内容,分析页面,获取页面上的所有链接,将链接放到 UrlList 列表,然后建立索引,如此不断循环。这星期一直在学习多线程,下面是抓取页面内容的代码,先做个备忘。

开始事件,以及线程函数

        private void Start_Click(object sender, EventArgs e)
        {
            startUrl 
= StartUrl.Text.Trim();
            
//获取网页内容的线程数
            if (!string.IsNullOrEmpty(ThreadCount.Text.Trim()))
            {
                threadCount 
= int.Parse(ThreadCount.Text.Trim());
            }
            
else
            {
                threadCount 
= 1;
            }
            
//获取链接线程数
            if (!string.IsNullOrEmpty(GetUrlThreadCount.Text.Trim()))
            {
                getUrlThreadCount 
= int.Parse(GetUrlThreadCount.Text.Trim());
            }
            
else
            {
                getUrlThreadCount 
= 1;
            }

            
if (startUrl == null)
            {
                MessageBox.Show(
"请输入链接地址");
                
return;
            }
            
else
            {
                Regex re 
= new Regex(@"http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
                
if (!re.Match(startUrl).Success)
                {
                    MessageBox.Show(
"链接格式错误");
                    
return;
                }
                
else
                {
                    urllist.Url.Add(startUrl);
                    urllist.IsDownload.Add(Encrypt.MD5EncryptStr(startUrl),
0);
                }
            }


            urllist.Url.Add(
"http://www.hao123.com");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.hao123.com"), 0);

            urllist.Url.Add(
"http://www.zhku.com");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.zhku.com"), 0);

            urllist.Url.Add(
"http://www.sina.com");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.sina.com"), 0);

            urllist.Url.Add(
"http://www.zhku.edu.cn");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.zhku.edu.cn"), 0);

            urllist.Url.Add(
"http://www.39.net");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.39.net"), 0);

            urllist.Url.Add(
"http://www.cnblogs.com");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.cnblogs.com"), 0);

            urllist.Url.Add(
"http://www.google.com");
            urllist.IsDownload.Add(Encrypt.MD5EncryptStr(
"http://www.google.com"), 0);

            Thread[] threadPool 
= new Thread[threadCount];
            
int count=3;

            
for (int i = 0; i < count; i++)
            {
                threadPool[i] 
= new Thread(new ParameterizedThreadStart(GetPageContent));
                threadPool[i].Start(i);
            }

                       
while (true)
            {
                
if (!threadPool[0].IsAlive && !threadPool[1].IsAlive && !threadPool[2].IsAlive)
                {
                    listBox1.DataSource 
= PageContentList.PageContents;
                    threadPool[
0].Abort();
                    threadPool[
1].Abort();
                    threadPool[
2].Abort();
                    
break;
                }
            }
        }

        
/// <summary>
        /// Worker-thread entry point: downloads every URL assigned to this worker and
        /// appends the page text to <c>PageContentList</c>.
        /// </summary>
        /// <remarks>
        /// URLs are split between workers by index: this worker takes
        /// <paramref name="startindex"/>, then every n-th entry after it, where n is
        /// the smaller of the URL count and <c>threadCount</c>. The <c>urllist</c>
        /// lock is held only while claiming a URL, never during the download, so
        /// workers actually run in parallel.
        /// </remarks>
        /// <param name="startindex">Boxed <see cref="int"/>: index of the first URL this worker handles.</param>
        public void GetPageContent(object startindex)
        {
            int start = (int)startindex;

            int stride;
            lock (urllist)
            {
                // At least 1, otherwise the loop below would never advance.
                stride = Math.Max(1, Math.Min(urllist.Url.Count, threadCount));
            }

            System.Text.Encoding encoder = System.Text.Encoding.GetEncoding("GB2312");

            for (int i = start; ; i += stride)
            {
                string url;
                string key;

                // Claim the URL under the lock; the flag stops any other worker taking it.
                lock (urllist)
                {
                    if (i >= urllist.Url.Count)
                    {
                        break;
                    }

                    url = urllist.Url[i];
                    key = Encrypt.MD5EncryptStr(url);

                    object state = urllist.IsDownload[key];
                    if (state != null && (int)state != 0)
                    {
                        continue;
                    }

                    urllist.IsDownload[key] = 1;
                }

                string content;
                try
                {
                    using (WebClient client = new WebClient())
                    {
                        byte[] raw = client.DownloadData(url.Trim());
                        content = encoder.GetString(raw, 0, raw.Length);
                    }
                }
                catch (WebException)
                {
                    // One unreachable site must not kill the worker; mark the URL as
                    // not downloaded again so the flag stays truthful.
                    lock (urllist)
                    {
                        urllist.IsDownload[key] = 0;
                    }

                    continue;
                }

                string contentKey = Encrypt.MD5EncryptStr(content);
                lock (PageContentList)
                {
                    PageContentList.PageContents.Add(content);

                    // Two URLs can serve identical content; Hashtable.Add would throw on the repeated key.
                    if (!PageContentList.IsAnalyse.ContainsKey(contentKey))
                    {
                        PageContentList.IsAnalyse.Add(contentKey, 0);
                    }

                    if (!PageContentList.IsIndexed.ContainsKey(contentKey))
                    {
                        PageContentList.IsIndexed.Add(contentKey, 0);
                    }
                }
            }
        }

链接列表类,页面内容类

    public class UrlList
    {
        
private List<string> url = new List<string>();
        
private Hashtable isDownload = new Hashtable();

        
/// <summary>
        
/// 下载链接
        
/// </summary>
        public List<string> Url
        {
            
get { return url; }
            
set { url = value; }
        }

        
/// <summary>
        
/// 是否为已下载链接0为否,1为是
        
/// </summary>
        public Hashtable IsDownload
        {
            
get { return isDownload; }
            
set { isDownload = value; }
        }
    }

    
/// <summary>
    /// Container for downloaded page texts and their "analysed" / "indexed" flag tables.
    /// </summary>
    public class PageContent
    {
        /// <summary>
        /// Creates an instance with an empty page list and empty flag tables.
        /// </summary>
        public PageContent()
        {
            PageContents = new List<string>();
            IsAnalyse = new Hashtable();
            IsIndexed = new Hashtable();
        }

        /// <summary>
        /// List of page contents.
        /// </summary>
        public List<string> PageContents { get; set; }

        /// <summary>
        /// Whether a page has been analysed: 0 = no, 1 = yes.
        /// </summary>
        public Hashtable IsAnalyse { get; set; }

        /// <summary>
        /// Whether a page has been indexed: 0 = no, 1 = yes.
        /// </summary>
        public Hashtable IsIndexed { get; set; }
    }

上面的页面都可以抓取到,可是点击开始后界面就很卡,不知道是什么原因,请各路高人帮忙看看,指点一下。小弟感激不尽!

原文地址:https://www.cnblogs.com/coolkiss/p/1374857.html