示例

最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

/// <summary>
/// Visits the team pages for team ids 1 through 30 and stores one "TEAM" entry per page
/// plus one "player" entry per roster table row. Both entry kinds carry the same teamid
/// value so that players can be joined back to their team.
/// </summary>
public void Run()
{
    Logger.ClearAll();

    // All team facts sit in the same list; only the trailing :eq(n) index differs.
    var infoItem = "div.blockB>div.left>div.pt>ul>li";

    for (int i = 1; i <= 30; i++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
        Default.Ready();

        // Read and clean every value once, then reuse it for both logging and storage.
        var teamid = i.ToString();
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("现任主教练:", "");

        Logger.Log(teamname);
        Logger.Log(teamurl);
        Logger.Log(teamcity);
        Logger.Log(gym);
        Logger.Log(peoplenum);
        Logger.Log(intonba);
        Logger.Log(champion);
        Logger.Log(coach);

        DataManager.AppendData("TEAM",
                               DataEntry.Create()
                               .Set("teamid", teamid)
                               .Set("teamname", teamname)
                               .Set("teamurl", teamurl)
                               // The home city was scraped and logged but never stored before.
                               .Set("teamcity", teamcity)
                               .Set("gym", gym)
                               .Set("peoplenum", peoplenum)
                               .Set("intonba", intonba)
                               .Set("champion", champion)
                               .Set("coach", coach)
                              );
        Logger.Log(teamid);

        // NOTE(review): this selector takes every table row, which may include a header
        // row without player cells - confirm how such rows should be handled.
        foreach (var player in Default.SelectNodes("div.tab>table tr"))
        {
            var num = player.SelectSingleNode("TD:eq(0)").Text();
            var link = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");
            // The player id is taken as the first run of digits in the link's href.
            var playerid = Regex.Match(link.Attr("href"), @"\d+").Value;
            var image = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playerimageurl = image.Attr("src");
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
            var height = player.SelectSingleNode("TD:eq(3)").Text();
            var weight = player.SelectSingleNode("TD:eq(4)").Text();
            var birth = player.SelectSingleNode("TD:eq(5)").Text();
            var college = player.SelectSingleNode("TD:eq(6)").Text();

            Logger.Log(image.Text());
            Logger.Log(playername);
            Logger.Log(position);
            Logger.Log(height);
            Logger.Log(weight);
            Logger.Log(birth);
            Logger.Log(college);
            Logger.Log(playerimageurl);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                                   DataEntry.Create()
                                   .Set("playerid", playerid)
                                   .Set("teamid", teamid)
                                   .Set("playername", playername)
                                   .Set("position", position)
                                   .Set("height", height)
                                   .Set("weight", weight)
                                   .Set("birth", birth)
                                   .Set("college", college)
                                   .Set("num", num)
                                   .Set("playerimageurl", playerimageurl)
                                  );
        }
    }
}

比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

脚本如下:

/// <summary>
/// Loads the January 2013 schedule page, collects every "技术统计" link on it, then visits
/// each link and stores two "GAMESTATIC" entries per game: status "0" for the team of the
/// first logo (treated as the away side by this script) and status "1" for the team of the
/// second logo (treated as the home side). Overtime scores are stored when the page has
/// between one and four overtime columns; otherwise those fields stay empty.
/// </summary>
public void Run()
{
    Logger.ClearAll();

    var scheduleUrl = "http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012";
    Default.Navigate(scheduleUrl);
    Default.Ready();

    // Resolve every link against the page that was actually loaded, and do so before
    // navigating away from the schedule page.
    var scheduleUri = new Uri(scheduleUrl);
    List<string> urls = new List<string>();
    foreach (var g in Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")"))
    {
        urls.Add(new Uri(scheduleUri, g.Attr("href")).ToString());
    }

    // The team id is the number that forms the logo image's file name.
    var logoId = new Regex(@"(?<id>\d+)\.jpg");

    foreach (var url in urls)
    {
        if (Default.Available == false) return;
        Default.Navigate(url);
        Default.Ready();

        var scores = Default.SelectNodes("table.tab04 tr");
        var scoreslist = Default.SelectNodes("table.tab02 tr>td");
        var logos = Default.SelectNodes("td.logo img");

        // The indexing below needs two logos, rows 1 and 2 of the totals table and eight
        // quarter cells; skip a page that lacks them instead of aborting the whole run.
        if (logos.Count < 2 || scores.Count < 3 || scoreslist.Count < 8)
        {
            Logger.Log("Skipping " + url + ": expected logo/score nodes were not found");
            continue;
        }

        var awayid = logoId.Match(logos[0].Attr("src")).Groups["id"].Value;
        var homeid = logoId.Match(logos[1].Attr("src")).Groups["id"].Value;
        var gametime = Default.SelectSingleNode("div.center>h2").Text().Replace("开始比赛", "");

        // Slot p holds the score of overtime period p + 1; unused slots stay empty.
        var awayOvertime = new string[] { "", "", "", "" };
        var homeOvertime = new string[] { "", "", "", "" };

        var overtimeCell = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
        if (!overtimeCell.IsEmpty())
        {
            // The number of header cells after div.more is used as the overtime period count.
            var periods = Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count;
            if (periods >= 1 && periods <= 4)
            {
                for (int p = 0; p < periods; p++)
                {
                    awayOvertime[p] = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(" + p + ")").Text();
                    homeOvertime[p] = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(" + p + ")").Text();
                }
            }
        }

        // Away side: quarter cells 0-3 and totals row 1.
        DataManager.AppendData("GAMESTATIC",
                               DataEntry.Create()
                               .Set("teamid", awayid)
                               .Set("gametime", gametime)
                               .Set("score1", scoreslist[0].Text())
                               .Set("score2", scoreslist[1].Text())
                               .Set("score3", scoreslist[2].Text())
                               .Set("score4", scoreslist[3].Text())
                               .Set("score", scores[1].Text())
                               .Set("gameid", url)
                               .Set("status", "0")
                               .Set("jiashiscore1", awayOvertime[0])
                               .Set("jiashiscore2", awayOvertime[1])
                               .Set("jiashiscore3", awayOvertime[2])
                               .Set("jiashiscore4", awayOvertime[3])
                              );

        // Home side: quarter cells 4-7 and totals row 2.
        DataManager.AppendData("GAMESTATIC",
                               DataEntry.Create()
                               .Set("teamid", homeid)
                               .Set("gametime", gametime)
                               .Set("score1", scoreslist[4].Text())
                               .Set("score2", scoreslist[5].Text())
                               .Set("score3", scoreslist[6].Text())
                               .Set("score4", scoreslist[7].Text())
                               .Set("score", scores[2].Text())
                               .Set("gameid", url)
                               .Set("status", "1")
                               .Set("jiashiscore1", homeOvertime[0])
                               .Set("jiashiscore2", homeOvertime[1])
                               .Set("jiashiscore3", homeOvertime[2])
                               .Set("jiashiscore4", homeOvertime[3])
                              );
    }
}

这里的亮点是脚本中处理加时赛的那两行: 先查找包含"加时赛"的单元格, 再判断它是否存在. 不是所有的比赛都有加时赛, 就算有也可能打多个加时(1-4个). 因此甜瓜非常细心地对这块也做了处理. 个人感觉这块代码也还是有优化的余地, 但是这种处理也非常简单直白, 一目了然, 也是很不错的. 

最后运行起来:

 

 

文中开发工具Spider Studio (采集工作站)下载地址: http://www.gdtsearch.com/products.spiderstudio.htm. 安装后运行, 将脚本复制进去点"运行"即可看到效果. 

Spider Studio QQ群: 45995410

原文地址:https://www.cnblogs.com/iamzyf/p/3446852.html