微博转发关系采集,可拓展关键字采集,评论采集(Java版)

微博模拟登录获取cookie,配置采集深度,采集一条微博转发关系页面,同时解析页面,生成一条微博的传播图,数据集可做微博影响力分析和传播分析

GitHub: https://github.com/czeze/WeiboCrwlZEZE

package main;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Random;

import org.apache.http.client.CookieStore;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.Cookie;
import com.sun.jna.Native.ffi_callback;

import Util.FileWriteUtil;
import Util.Id2MidUtil;

/**
 * Crawler for a single Weibo post: the post URL is the input.
 *
 * @ClassName: WeiboCrawler2
 * @Description: collects the information of one Weibo post, given its URL
 * @author Zeze
 * @date 2016-04-10 10:39:50
 *
 */
public class WeiboCrawler2 {
    
    // URL of the post to crawl; main() cuts the message id and user id out of it.
    private static String Url = "http://weibo.com/5892492312/DpsRXpOyG?from=page_1005055892492312_profile&wvr=6&mod=weibotime&type=comment";
    private static int CrawlDeep = 18;// crawl depth
    private static int SleepTime = 1000;// pause between requests, passed to Thread.sleep (milliseconds)
    private static int NumCookies=7;// number of cookies; upper bound for the random cookie-file suffix
    
    private static Logger logger = Logger.getLogger(WeiboCrawler2.class);
    private static String cookiePath = "F:/WeiBo/cookie/cookie.file";// cookie path; a random index is appended before loading
    private static String outputpath = "F:/WeiBo/Data/";// output directory; main() appends the message id
    private static String destfile = "F:/WeiBo/Data/";// crawl result location; main() turns it into the per-post CSV path
    // Running count of repost rows written so far.
    private static int cnt = 0;

    /**
     * Entry point: prepares the output location for the post identified by {@link #Url},
     * writes the CSV column header, and then iterates over the pages reported by
     * {@code GetPageNum}.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {

        // Message id and user id are cut out of the post URL.
        String mid = GetMid(Url);// e.g. D8hxnrQdM
        String uid = GetUid(Url);// e.g. 1713926427

        outputpath = outputpath + mid + "/";// per-post output directory
        destfile = destfile + mid + "/msg" + mid + ".csv";// CSV file that receives the crawled rows

        File file2 = new File(outputpath);
        if (!file2.exists()) {
            file2.mkdirs();
        }

        // Column header. The line terminator has to be the "\n" escape sequence: a raw
        // line break inside a string literal leaves the literal unterminated.
        String wString = "tzmid,zid,zname,zmid,mid,ztext,zurl,zsource,zzan,ztime,deep";
        FileWriteUtil.WriteDocument(destfile, wString + "\n");

        // Number of pages to fetch
        int PageNum = GetPageNum(mid, uid, 0, CrawlDeep);

        for (int i = 1; i <= PageNum; i++) {
            
(mid, uid, Integer.toString(i),
0, CrawlDeep); try {// 采集间隔 Thread.sleep(SleepTime); } catch (InterruptedException e) { logger.error(e); return; } } } webClient.closeAllWindows(); return Num; } //转发内容 if (info.select("span[class=cmt]").text().equals("")) {// 不是转发的内容 text = info.select("span[class=ctt]").text();// 正文 } // 时间 Elements time = info.select("span[class=ct]"); //转发 Elements rt = doc.select("div").select("span[id=rt]"); //评论 Elements ct = doc.select("div").select("span:contains(评论)"); // Elements zan = doc.select("div").select("span:contains(赞)"); text = text.trim(); name = BoZhu.get(0).text().trim(); timeStr = time.text().trim(); ZhuanFaNum = rt.text().trim().substring(2).replace("[", "").replace("]", ""); PinlunNum = ct.text().trim().substring(3).replace("[", "").replace("]", ""); zanNum = zan.get(0).text().trim().substring(2).replace("[", "").replace("]", ""); if (!rt.text().contains("["))//判断是否有转发 ZhuanFaNum = "0"; if (!ct.text().contains("[")) PinlunNum = "0"; System.out.println("英文消息ID: " + mid); System.out.println("数字消息ID: " + Nummid); System.out.println("用户ID: " + uid); System.out.println("博主: " + name); System.out.println("正文内容: " + text); String zname = null;// 转发的用户名 String zid = null;// 转发的用户ID String zzan = null;// 点赞数 String zmid = null;// 转发的消息id String ztime = null;// 转发时间 String zsource = null;// 来源 String ztext = null;// 转发的内容 String zurl = null; if (!rt.text().contains("[")) { System.out.println("没有转发"); try {// 采集间隔1s Thread.sleep(SleepTime); } catch (InterruptedException e) { logger.error(e); } return; } if (doc.select("[id=pagelist]").text().contains("页")) {//转发页数 String pnum = doc.select("[id=pagelist]").get(0).text(); pnum = pnum.substring(pnum.indexOf("/") + 1).replace("页", ""); System.out.println("转发页数:" + pnum); } for (Element result : RTList) {// 解析列表 // 点赞数 zzan = result.select("span[class=cc]").text(); if (zzan.equals("")) {// 过滤没有点赞标签 continue; } zzan = zzan.trim().substring(1).replace("[", "").replace("]", ""); if (result.select("a").size() > 0) { zname = 
result.select("a").get(0).text();// 转发的用户名 zid = result.select("a").get(0).toString();// 转发的用户id if (zid.indexOf("u") == 10) {// 正常的用户id zid = zid.substring(zid.indexOf("">") - 10, zid.indexOf("">")); } else { zid = zid.substring(zid.indexOf("/") + 1, zid.indexOf("">")); } } if (result.text().contains("查看更多热门")) continue; // 转发时间和来源 String tmp = result.select("span[class=ct]").text(); ztime = tmp.substring(0, tmp.indexOf("来自")); zsource = tmp.substring(tmp.indexOf("来自") + 2); text.indexOf("赞")); } zurl = "http://weibo.cn/repost/" + zmid + "?uid=" + zid; String tzmid = new Id2MidUtil().Uid2Mid(zmid); // 消息ID,用户ID,用户名,屏幕名,转发消息ID,消息内容,消息URL,来源,赞数,发布时间,层数 // tzmid,zid,zname,zmid,mid,ztext,zurl,zsource,zzan,ztime,deep cnt++; String wString = tzmid + "," + zid + "," + zname + "," + zmid + "," + Nummid + "," + ztext + "," + zurl + "," + zsource + "," + zzan + "," + ztime + "," + deep; System.out.println(cnt + ":" + wString); StringBuffer sBuilder = new StringBuffer(); sBuilder.append(wString + " "); FileWriteUtil.WriteDocument(destfile, sBuilder.toString()); } } } System.out.println("采集到的转发数目:" + cnt); return; } // 解析第二页开始 public static void parserPage(HtmlPage page, String mid, String uid, int deep, int crawldeep) { String html = page.getWebResponse().getContentAsString(); Document doc = Jsoup.parse(html); String zname = null;// 转发的用户名 String zid = null;// 转发的用户ID String zzan = null;// 点赞数 String zmid = null;// 转发的消息id String ztime = null;// 转发时间 String zsource = null;// 来源 String ztext = null;// 转发的内容 String zurl = null; mid = new Id2MidUtil().Uid2Mid(mid);// 消息ID Elements RTList = doc.select("div[class =c]"); for (Element result : RTList) { // 点赞数 zzan = result.select("span[class=cc]").text(); if (zzan.equals("")) {// 过滤没有点赞标签 continue; } zzan = zzan.trim().substring(1).replace("[", "").replace("]", ""); if (result.select("a").size() > 0) { zname = result.select("a").get(0).text();// 转发的用户名 zcontinue; // 转发时间和来源 String tmp = result.select("span[class=ct]").text(); 
ztime = tmp.substring(0, tmp.indexOf("来自")); zsource = tmp.substring(tmp.indexOf("来自") + 2); // 转发的消息id zmid = result.select("span[class=cc]").toString(); zmid = zmid.substring(zmid.indexOf("attitude") + 9, zmid.indexOf("attitude") + 18); // 转发的内容 ztext = result.text(); if (ztext.contains("//@")) { ztext = ztext.substring(ztext.indexOf(":") + 1, ztext.indexOf("//@")); } else if(ztext.contains(":")&&ztext.contains("赞")){ ztext = ztext.substring(ztext.indexOf(":") + 1, ztext.indexOf("赞")); } deep; System.out.println(cnt + ":" + wString); StringBuffer sBuilder = new StringBuffer(); sBuilder.append(wString + " "); FileWriteUtil.WriteDocument(destfile, sBuilder.toString()); if (deep < crawldeep) {// 采集深度 int PageNum = GetPageNum(zmid, zid, deep + 1, crawldeep); for (int i = 1; i <= PageNum; i++) { System.out.println("当前采集深度"+deep); (zmid, zid, Integer.toString(i), deep + 1, crawldeep); try {// 采集间隔1s Thread.sleep(SleepTime); } catch (InterruptedException e) { logger.error(e); return; } } } } System.out.println("采集到的转发数目:" + cnt); return; } // 写入文件 public static void SavePage(HtmlPage page, String path) { File file2 = null; file2 = new File(path); if (file2.exists()) logger.warn("outfile exit!"); else { FileOutputStream outputStream; try { outputStream = new FileOutputStream(file2); outputStream.write(page.getWebResponse().getContentAsString().getBytes()); outputStream.close(); } catch (FileNotFoundException e) { logger.error(e); } catch (IOException e) { logger.error(e); } } } // 随机获取cookies public static CookieStore GetCookieStore() { CookieStore cookieStore = null; File file = new File(cookiePathAppendRandom()); if (file.exists()) { FileInputStream fin = null; ObjectInputStream in; try { in = new ObjectInputStream(fin); cookieStore = (CookieStore) in.readObject(); in.close(); } catch (IOException e) { logger.error(e); } catch (ClassNotFoundException e) { logger.error(e); } } else { logger.warn("CookiePath doesn`t exit !!!"); } return cookieStore; } private static 
String cookiePathAppendRandom() { Random random = new Random(); return cookiePath + random.nextInt(NumCookies); } // http://weibo.com/1713926427/D8hxnrQdM?type=repost#_rnd1460261627854 public static String GetUid(String url) { int index = url.indexOf("weibo.com") + 10; } public static String GetMid(String url) { int index = url.indexOf("weibo.com") + 21; return url.substring(index, index + 9); } }
原文地址:https://www.cnblogs.com/zeze/p/5381440.html