基于深度优先搜索的蜘蛛程序


     这几天发现一个很好的图片网站,美女特多! 就打算下点图片,但是自己下载的话,翻来覆去的太麻烦,所以找了个蜘蛛程序来帮忙。
随便在网上查了查,就下载了一个名叫WebSpider的蜘蛛程序。我仔细研究了一下,感觉作者也是写着玩的,意思意思,网页下载下来
基本就丢了,另外结构上我也不太满意,所以就改了改。
      我大致的想法是采用双线程,一个UI线程,一个工作线程,抓取方面采用深度优先搜索,基本思路:得到当前网页,提取并下载图片,然后用正则表达式匹配网址,然后递归处理!在处理过程中,使用一个集合类来收集处理过的网址,防止死循环。代码大致如下:
  1   public bool Process( WebPageState state )
  2      {
  3         state.ProcessStarted       = true;
  4         state.ProcessSuccessfull   = false;
  5
  6          if(level==1)
  7             m_baseUri = state.Uri;
  8         try
  9         {
 10            Console.WriteLine( "Process Uri: {0}", state.Uri.AbsoluteUri );
 11
 12            WebRequest  req = WebRequest.Create( state.Uri );
 13            WebResponse res = null;
 14
 15            try
 16            {
 17               res = req.GetResponse( );
 18
 19               if ( res is HttpWebResponse )
 20               {
 21                  state.StatusCode        = ((HttpWebResponse)res).StatusCode.ToString( );
 22                  state.StatusDescription = ((HttpWebResponse)res).StatusDescription;
 23               }

 24               if ( res is FileWebResponse )
 25               {
 26                  state.StatusCode        = "OK";
 27                  state.StatusDescription = "OK";
 28               }

 29
 30               if ( state.StatusCode.Equals( "OK" ) )
 31               {
 32                  StreamReader   sr    = new StreamReader( res.GetResponseStream( ) );
 33            
 34                  state.Content        = sr.ReadToEnd( );
 35
 36
 37                  MatchCollection m = RegExUtil.GetMatchRegEx(RegularExpression.SrcExtractor, state.Content, true);
 38                  string Address;
 39                   int k=0;
 40                   for (k = 0; k < m.Count;k++)
 41                   {
 42
 43
 44                       Address = m[k].Groups[1].ToString();
 45                       Uri uri = new Uri(state.Uri, m[k].Groups["url"].ToString());
 46                       // statusBar.Text = "Address: " + Address;
 47                       if (!m_pages.Contains(uri.AbsoluteUri))
 48                       {
 49                           m_pages.Add(uri.AbsoluteUri);
 50                           DownloadImage(state.Uri, Address);
 51                           if (this.ContentHandler != null)
 52                           {
 53                               state.mes.MaxProgress = m.Count;
 54                             
 55                               state.mes.Progress = k+1;
 56                               state.mes.Result = state.Uri.AbsoluteUri;
 57                               state.mes.Status = TaskStatus.Running;
 58                               state.mes.Message = "当前共有图片下载数"+m.Count+"  现在正在下载第"+state.mes.Progress.ToString()+"图片" + Address;
 59                               ContentHandler.Invoke(state);
 60                           }

 61                       }

 62
 63                     
 64                   }

 65               
 66                      int counter = 0;
 67                      Match mm= RegExUtil.GetMatchRegEx(RegularExpression.UrlExtractor, state.Content);
 68
 69                      while (mm.Success)
 70                      {
 71                          Uri uri = new Uri(state.Uri, mm.Groups["url"].ToString());
 72                          if (ValidPage(uri) && !m_pages.Contains(uri.AbsoluteUri))
 73                          {
 74                              if (level > 10)
 75                                  return true;
 76                              counter++;
 77                              level++;
 78                              WebPageState statec = new WebPageState(uri);
 79                              m_pages.Add(uri.AbsoluteUri);
 80                              Process(statec);
 81                          }

 82
 83
 84                          mm = mm.NextMatch();
 85                      }

 86               
 87               }

 88
 89               state.ProcessSuccessfull = true;
 90            }

 91            catch( Exception ex )
 92            {
 93               HandleException( ex, state );
 94            }

 95            finally
 96            {
 97               if ( res != null )
 98               {
 99                  res.Close( );
100               }

101            }

102         }

103         catch (Exception ex)
104         {
105            Console.WriteLine( ex.ToString( ) );
106         }

107         Console.WriteLine( "Successfull: {0}", state.ProcessSuccessfull );
108
109         return state.ProcessSuccessfull;
110      }

111      #endregion
112
113
/// <summary>
/// Downloads one image (or Flash file) referenced by a page into the local
/// "temp" directory as <c>img{N}.{ext}</c>. Only the extensions jpg, jpeg
/// and swf are downloaded; everything else is ignored. Failures are logged
/// to the console and otherwise ignored so one bad image does not stop the
/// crawl.
/// </summary>
/// <param name="m_bb">URI of the page the image reference was found on; used as the base for relative references.</param>
/// <param name="imgUri">The image reference as it appeared in the page (absolute or relative).</param>
private void DownloadImage(Uri m_bb,string imgUri)
{
    Uri imageUri = null;

    try
    {
        imageUri = new Uri(m_bb, imgUri);

        // NOTE(review): StrUtil.RightLastIndexOf presumably returns the text
        // after the last "." - it is defined outside this block; confirm.
        string ext = StrUtil.RightLastIndexOf(imageUri.AbsoluteUri, ".").ToLowerInvariant();

        // The file id is consumed for every candidate, downloaded or not,
        // exactly as the original numbering did.
        string outFile = "temp\\img" + (m_fileId++) + "." + ext;

        // Exact match: a substring test against "jpg|jpeg|swf" would also
        // accept "", "j", "pg", "|" and similar fragments.
        if (ext != "jpg" && ext != "jpeg" && ext != "swf")
        {
            return;
        }

        // DownloadFile does not create missing directories.
        Directory.CreateDirectory("temp");

        using (WebClient web = new WebClient())
        {
            web.DownloadFile(imageUri.AbsoluteUri, outFile);
        }
    }
    catch (Exception ex)
    {
        // Best effort: report and carry on with the next image.
        Console.WriteLine("could not download img: "
            + (imageUri != null ? imageUri.AbsoluteUri : imgUri)
            + " (" + ex.Message + ")");
    }
}
   现在基本可以下载图片了,不过感觉要优化的地方较多! 递归的层级暂时没有控制,性能也是一般,代码的结构还是比较乱,后续再重构了!
原文地址:https://www.cnblogs.com/jacky0952/p/spider.html