抓取网页(table)信息(导出到数据库中、excel中)

1、存储数据到数据库中:

  1)获取网页中table中的列名,作为数据库表的列名(@"<th scope=""col"">";和"</th>";是table中的包括列名的标签)

/// <summary>
        /// 获取列名
        /// </summary>
        /// <param name="sbHtmltext"></param>
        public void GetColomnNumAndName(string sbHtmltext)
        {
            int i = 0;
            for (i = 0; ; i++)
            {
                string ColomnName = "";
                string StartColoumnName = @"<th scope=""col"">";
                string EndColoumnName = "</th>";
                int StartColoumnNameIndex = sbHtmltext.IndexOf(StartColoumnName);
                if (StartColoumnNameIndex > -1)
                {
                    int EndColoumnNameIndex = sbHtmltext.IndexOf(EndColoumnName);
                    int longs = EndColoumnNameIndex - StartColoumnNameIndex - StartColoumnName.Length;
                    ColomnName = sbHtmltext.Substring(StartColoumnNameIndex + StartColoumnName.Length, longs);
                    dt.Columns.Add(ColomnName.ToString().Trim());
                    sbHtmltext = sbHtmltext.Remove(0, (EndColoumnNameIndex + EndColoumnName.Length));
                }
                else
                {
                    break;
                }
            }
        }

  2)获取table中的数据

/// <summary>
        /// 获取数据
        /// </summary>
        /// <param name="sbHtmltext"></param>
        public void GetValue(string sbHtmltext)
        {
            //去掉页面头
            string StartColoumnName = @"<table cellspacing=""0"" rules=""all"" border=""1"" id=""ctl00_ContentPlaceHolder1_gvChanpin"" style=""border-collapse:collapse;"">";
            string EndColoumnName = @"<td colspan=""14""><table border=""0"">";
            string StartValue = "<td>";
            string EndValue = "</td>";
            int StartColoumnNameIndex = sbHtmltext.IndexOf(StartColoumnName);
            int EndColoumnNameIndex = sbHtmltext.IndexOf(EndColoumnName);
            int longs = EndColoumnNameIndex - StartColoumnNameIndex - StartColoumnName.Length;
            string sbHtmltextChild = sbHtmltext.Substring(StartColoumnNameIndex + StartColoumnName.Length, longs);//删除不包含数据的上下元素
            for (int i = 0; ; i++)
            {
                string Value = "";
                int StartChild = sbHtmltextChild.IndexOf(StartValue);
                if (StartChild > -1)
                {
                    DataRow dr = dt.NewRow();
                    for (int j = 0; ; j++)
                    {
                        int StartValueIndex = sbHtmltextChild.IndexOf(StartValue);
                        if (StartValueIndex > -1 && StartValueIndex != 18)
                        {
                            int EndValueIndex = sbHtmltextChild.IndexOf(EndValue);
                            int longsChild = EndValueIndex - StartValueIndex - StartValue.Length;
                            Value = sbHtmltextChild.Substring(StartValueIndex + StartValue.Length, longsChild).Replace("\r\n", "").Replace("\t", "").Replace("\"", """).Trim();
                            dr[j] = Value.ToString().Trim();
                            sbHtmltextChild = sbHtmltextChild.Remove(0, (EndValueIndex + EndValue.Length));
                        }
                        else
                        {
                            break;
                        }
                    }
                    dt.Rows.Add(dr);
                    int removelen = sbHtmltextChild.IndexOf(StartValue);
                    if (removelen > -1)
                    {
                        sbHtmltextChild = sbHtmltextChild.Remove(0, removelen);
                    }
                }
                else
                {
                    break;
                }
            }
        }

    3)button按钮是触发事件,调用方法:

 try
            {
                if (txttablename.Text== "")
                {
                    MessageBox.Show("请输入表名!");
                    return;
                }
                string sbHtmltext = WBdata.DocumentText;//获取所有页面元素
                GetColomnNumAndName(sbHtmltext);//获取列名
            GetValue(sbHtmltext);//获取数据            
                if (dt != null && dt.Rows.Count > 0)
                    {
                        string descTableName = txttablename.Text.Trim();
                        string str = bll.SourceExcel_Import_Web(descTableName, dt);
                        dt.Reset();//重置数据容器DataTable
                        MessageBox.Show(str);

                        // 定位控件webBrowser滚动条的位置
                        HtmlDocument document = this.WBdata.Document;
                        document.Window.ScrollTo(750, 1200);
                    }
                    else
                    {
                        MessageBox.Show("未采集到数据!");
                    }
            }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }

  4)DAL层功能的实现

/// 导入数据
        /// <summary>
        /// </summary>
        /// <param name="descTableName"></param>
        /// <param name="dtSource"></param>
        /// <returns></returns>
        public string SourceExcel_Import_Web(string descTableName, DataTable dtSource)
        {
            string succ = "";
            try
            {
                string column = "";
                for (int i = 0; i < dtSource.Columns.Count; i++)
                {
                    if (i == 0)
                    {
                        column += dtSource.Columns[i].ColumnName + " varchar(max)";
                    }
                    else
                    {
                        column += "," + dtSource.Columns[i].ColumnName + " varchar(max)";
                    }
                }
              
                string strsql = "if not exists(select * from sys.objects where name ='" + descTableName + "')begin create table " + descTableName + "(" + column + ")end ";
                HZ.Data.DbHelperFactory.Create(connectWeb).ExecuteNonQuery(strsql);
          //HZ.Data.DbHelperFactory.Create(connectWeb).ExecuteNonQuery(strsql)是封装好的DBHelper
SqlBulkCopyOptions sqlBulkCopyOptions; sqlBulkCopyOptions = SqlBulkCopyOptions.FireTriggers; SqlBulkCopy DTS = new System.Data.SqlClient.SqlBulkCopy(connectWeb, sqlBulkCopyOptions); DTS.NotifyAfter = 1; DTS.DestinationTableName = descTableName; DTS.BulkCopyTimeout = 60000000; DTS.WriteToServer(dtSource); succ = "保存成功!" + dtSource.Rows.Count + "条数据"; } catch (Exception ex) { succ = ex.Message; } return succ; } }

  总结:应该先在数据库中手动的创建一个数据库,表名是手动输入的。。

2、导出excel

  1)导出excel的方法。(common.file)

 private static Form _openerForm;
        /// <summary>
        /// 所属窗体
        /// </summary>
        public static Form OpenerForm
        {
            set { _openerForm = value; }
            get { return _openerForm; }
        }

        /// <summary>
        /// 导出到excel
        /// </summary>
        /// <param name="dataTable"></param>
        /// <returns></returns>
        public static bool ExportDataTableToExcel(System.Data.DataTable dataTable)
        {
            SaveFileDialog saveFileDialog = new SaveFileDialog();
            saveFileDialog.Filter = "Execl files (*.xls)|*.xls";
            saveFileDialog.FilterIndex = 0;
            saveFileDialog.RestoreDirectory = true;
            saveFileDialog.CreatePrompt = false;
            saveFileDialog.Title = "导出Excel文件到";

            if (saveFileDialog.ShowDialog() == DialogResult.Cancel)
            {
                return false;
            }

            excel9.Application oXL;
            excel9._Workbook oWB;
            excel9._Worksheet oSheet;
            excel9.Range oRng;

            try
            {
                oXL = new excel9.Application();
                oXL.Visible = false;

                oWB = (excel9._Workbook)(oXL.Workbooks.Add(Missing.Value));
                oSheet = (excel9._Worksheet)oWB.ActiveSheet;

                int _RowCount = dataTable.Rows.Count;
                int _ColumnCount = dataTable.Columns.Count;

                for (int i = 0; i < _ColumnCount; i++)
                {
                    oSheet.Cells[1, i + 1] = dataTable.Columns[i].Caption;
                }

                for (int j = 0; j < _RowCount; j++)
                {
                    for (int k = 0; k < _ColumnCount; k++)
                    {
                        oSheet.Cells[j + 2, k + 1] = dataTable.Rows[j].ItemArray[k].ToString();
                    }
                }

                string ExcelHeader = GetExcelHeader(_ColumnCount);

                oWB.SaveAs
                (
                    saveFileDialog.FileName,
                    excel9.XlFileFormat.xlExcel7,
                    Type.Missing,
                    Type.Missing,
                    Type.Missing,
                    Type.Missing,
                    excel9.XlSaveAsAccessMode.xlExclusive,
                    Type.Missing,
                    Type.Missing,
                    Type.Missing,
                    Type.Missing
                );

                oWB.Close(null, null, null);
                oXL.Workbooks.Close();
                oXL.Quit();

                return true;
            }
            catch (Exception e)
            {
                MessageBox.Show(e.ToString());
                return false;
            }
            finally
            {
            }
        }

  

  private static string GetExcelHeader(int number)
        {
            string[] Header =
                new string[] 
                { 
                    " ", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", 
                    "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "W", 
                    "Z" 
                };
            if (number <= 26)
            {
                return Header[number];
            }
            else
            {
                return GetExcelHeader(number / 26) + Header[number % 26];
            }
        }

    2)button按钮下的方法

      try
            {               
                string sbHtmltext = WBdata.DocumentText;//获取所有页面元素
              GetColomnNumAndName(sbHtmltext);//获取列名
                GetValue(sbHtmltext);//获取数据            
                //判断导出是否成功
                if (common.File.ExportDataTableToExcel(dt))
                {
                    MessageBox.Show("导出成功!");
                    dt.Reset();

                    // 定位控件webBrowser滚动条的位置
                    HtmlDocument document = this.WBdata.Document;
                    document.Window.ScrollTo(750, 1000);
                }
                else
                {
                    MessageBox.Show("导出数据失败!");
                }
            }
            catch(Exception ex)
            {
                MessageBox.Show(ex.Message);
            }
        

  

原文地址:https://www.cnblogs.com/SnailWalk/p/2965492.html