用Java爬虫爬取凤凰财经提供的沪深A股所有股票代号名称

要爬取的凤凰财经网址:http://app.finance.ifeng.com/list/stock.php?t=hs

本作主要采用的技术是jsoup,相关介绍网页:https://www.jianshu.com/p/69b395bee43a

其官网:https://jsoup.org/

爬取程序:

package com.ufo.hy.agumaster.tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.ufo.hy.agumaster.entity.Stock;

/**
 * Crawl stock code/name from FengHuang finance website:http://app.finance.ifeng.com/list/stock.php?t=hs
 * Main package:jsoup
 * Dependency:
 *         <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>
 * @author heyang
 *
 */
public class FenghuangCrawler {
    private static final String SRC_URL="http://app.finance.ifeng.com/list/stock.php?t=hs";
    private static final String ENCODING = "utf-8";
    
    // Used to save stock code names
    private List<Stock> stockList; 

    public FenghuangCrawler() {
        stockList=new ArrayList<Stock>();
        String url=SRC_URL;
        
        int idx=0;
        while(true) {
            System.out.println(url);
            
            String html = getUrlHtml(url,ENCODING);
            Document doc = Jsoup.parse(html,ENCODING);
            
            // Find core node
            Element divtab01 = doc.getElementsByClass("tab01").last();
            
            // Find stocks
            Elements trs=divtab01.getElementsByTag("tr");
            for(Element tr:trs) {
                Elements tds=tr.getElementsByTag("td");
                if(tds.size()>2) {
                    Element codeElm=tds.get(0).getElementsByTag("a").last();
                    Element nameElm=tds.get(1).getElementsByTag("a").last();
                    
                    Stock s=new Stock(idx++,codeElm.text(),nameElm.text());
                    stockList.add(s);
                }
            }
            
            // Find next page url
            Element lastLink=divtab01.getElementsByTag("a").last();            
            if(lastLink.text().equals("下一页")) {
                url="http://app.finance.ifeng.com/list/stock.php"+lastLink.attr("href");
            }else {
                break;
            }
        }    
        
        for(Stock s:stockList) {
            System.out.println(s);
        }
        System.out.println("共找到"+idx+"个股票.");
    }

    private String getUrlHtml(String url, String encoding) {
        StringBuffer sb = new StringBuffer();
        URL urlObj = null;
        URLConnection openConnection = null;
        InputStreamReader isr = null;
        BufferedReader br = null;
        try {
            urlObj = new URL(url);
            openConnection = urlObj.openConnection();
            isr = new InputStreamReader(openConnection.getInputStream(), encoding);
            br = new BufferedReader(isr);
            String temp = null;
            while ((temp = br.readLine()) != null) {
                sb.append(temp + "
");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (isr != null) {
                    isr.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }
    
    public List<Stock> getStockList() {
        return stockList;
    }

    public static void main(String[] args) {
        // 根据需要设置代理
        System.setProperty("http.proxyHost", "");
        System.setProperty("http.proxyPort", "");

        new FenghuangCrawler();
    }
}

运行结果节选:

...
Stock id:3743 code:002752 name:昇兴股份
Stock id:3744 code:000796 name:凯撒旅业
Stock id:3745 code:603233 name:大参林
Stock id:3746 code:000048 name:京基智农
Stock id:3747 code:300463 name:迈克生物
Stock id:3748 code:300485 name:赛升药业
Stock id:3749 code:603387 name:基蛋生物
Stock id:3750 code:002469 name:三维工程
Stock id:3751 code:600052 name:浙江广厦
Stock id:3752 code:002187 name:广百股份
Stock id:3753 code:300069 name:金利华电
Stock id:3754 code:300317 name:珈伟新能
Stock id:3755 code:002637 name:赞宇科技
Stock id:3756 code:001914 name:招商积余
Stock id:3757 code:000564 name:供销大集
Stock id:3758 code:002363 name:隆基机械
Stock id:3759 code:603709 name:中源家居
Stock id:3760 code:000802 name:北京文化
Stock id:3761 code:002127 name:南极电商
Stock id:3762 code:600107 name:美尔雅
Stock id:3763 code:002678 name:珠江钢琴
Stock id:3764 code:002083 name:孚日股份
Stock id:3765 code:300325 name:德威新材
共找到3766个股票.

这是2020年5月1日的数据。

参考资料:

https://www.jianshu.com/p/3430f4d0b384
https://blog.csdn.net/qq_28940573/article/details/99295276

--2020-04-30--

原文地址:https://www.cnblogs.com/heyang78/p/12808381.html