下载生物信息

http://www.catalogueoflife.org/col/browse/classification

这是一个国外的生物信息网站

今天的代码可以抓取指定分类的信息(id,学名)

没有把多线程写进去,略失败...

运用:WebClient、Regex、System.IO

项目在>>>开源中国

 1 using System;
 2 using System.Collections.Generic;
 3 using System.ComponentModel;
 4 using System.Data;
 5 using System.Drawing;
 6 using System.Linq;
 7 using System.Text;
 8 using System.Threading.Tasks;
 9 using System.Windows.Forms;
10 using System.Net;
11 using System.Text.RegularExpressions;
12 using System.Threading;
13 using System.IO;
namespace cateoflife
{
    /// <summary>
    /// Main form of the scraper. On a button click it downloads the pages
    /// <c>url + i</c> for every <c>i</c> from the start index to the end index,
    /// applies the user-supplied regular expression to each page, and appends one
    /// "id&lt;TAB&gt;name" line per match to the file <c>&lt;start&gt;.txt</c>.
    /// </summary>
    /// <remarks>
    /// Inputs are read from the designer-declared controls: <c>textBox1</c> (base URL),
    /// <c>textBox2</c> (start index), <c>textBox3</c> (end index, 0 = use
    /// <see cref="DefaultEnd"/>) and <c>textBox4</c> (record pattern).
    /// All work runs synchronously inside the click handler.
    /// </remarks>
    public partial class Form1 : Form
    {
        /// <summary>End index used when the end box contains 0.</summary>
        private const int DefaultEnd = 99999;

        // Verbatim strings keep the regex backslashes intact (a plain "\d" is not a
        // valid C# escape). Built once and reused for every record.
        private static readonly Regex IdPattern = new Regex(@"(?<=/)\d+");
        private static readonly Regex NamePattern = new Regex(@"(?<=>)\w+\s*\w+");

        WebClient wc = new WebClient();
        int start;
        int end;
        string url;
        string reg;
        string msg;
        int now = 1; // not read anywhere in this part of the partial class

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the scrape for the range and pattern currently entered in the form.
        /// </summary>
        /// <remarks>
        /// Invalid numbers or an invalid pattern are reported with a message box and
        /// nothing is downloaded. A page whose download fails twice in a row lets the
        /// <see cref="WebException"/> propagate; the output file is still closed, and
        /// everything written for earlier pages has already been flushed.
        /// </remarks>
        private void button1_Click(object sender, EventArgs e)
        {
            int first;
            int last;
            if (!int.TryParse(textBox2.Text, out first) || !int.TryParse(textBox3.Text, out last))
            {
                MessageBox.Show("Start and end must be whole numbers.");
                return;
            }

            start = first;
            end = (last == 0) ? DefaultEnd : last;
            url = textBox1.Text;
            reg = textBox4.Text;

            // Parse the user pattern once, before any file or network work, so a typo
            // is reported immediately instead of failing on the first page.
            Regex recordPattern;
            try
            {
                recordPattern = new Regex(reg);
            }
            catch (ArgumentException ex)
            {
                MessageBox.Show("Invalid pattern: " + ex.Message);
                return;
            }

            wc.Encoding = Encoding.UTF8;

            // append: true continues an existing file, matching the original seek-to-end.
            // The using block guarantees the file is closed even when a download throws.
            using (StreamWriter w = new StreamWriter(start + ".txt", true))
            {
                for (int i = start; i <= end; i++)
                {
                    string htm = DownloadWithRetry(url + i);
                    foreach (Match m in recordPattern.Matches(htm))
                    {
                        gettxt(m.ToString());
                        w.Write(msg);
                    }

                    // Persist each finished page so a later failure loses nothing.
                    w.Flush();
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="address"/> as a string, retrying once if the
        /// first attempt fails with a <see cref="WebException"/>.
        /// </summary>
        /// <param name="address">Full URL of the page to fetch.</param>
        /// <returns>The page body decoded with the client's configured encoding.</returns>
        /// <exception cref="WebException">Both attempts failed.</exception>
        private string DownloadWithRetry(string address)
        {
            try
            {
                return wc.DownloadString(address);
            }
            catch (WebException)
            {
                // Single retry; a second failure is allowed to propagate.
                return wc.DownloadString(address);
            }
        }

        /// <summary>
        /// Builds one output record from a matched HTML fragment and stores it in
        /// <c>msg</c>: the first run of digits that follows a '/', a tab, the first
        /// "word [whitespace] word" run that follows a '&gt;', and a line terminator.
        /// A part that is not found contributes an empty string.
        /// </summary>
        /// <param name="html">The HTML fragment matched by the record pattern.</param>
        void gettxt(string html)
        {
            // NOTE(review): the published listing had a raw line break inside the
            // string literal; Environment.NewLine is assumed to be the intended
            // terminator - confirm if a bare "\n" was meant.
            msg = IdPattern.Match(html).Value + "\t" + NamePattern.Match(html).Value + Environment.NewLine;
        }
    }
}
原文地址:https://www.cnblogs.com/Fadinglemon/p/3737058.html