解析selenium http://blog.csdn.net/java2000_net/article/details/3721706
- package com.laozizhu.apache.httpclient;
- import java.net.Socket;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.http.ConnectionReuseStrategy;
- import org.apache.http.HttpHost;
- import org.apache.http.HttpResponse;
- import org.apache.http.HttpVersion;
- import org.apache.http.impl.DefaultConnectionReuseStrategy;
- import org.apache.http.impl.DefaultHttpClientConnection;
- import org.apache.http.message.BasicHttpRequest;
- import org.apache.http.params.BasicHttpParams;
- import org.apache.http.params.HttpParams;
- import org.apache.http.params.HttpProtocolParams;
- import org.apache.http.protocol.BasicHttpContext;
- import org.apache.http.protocol.BasicHttpProcessor;
- import org.apache.http.protocol.ExecutionContext;
- import org.apache.http.protocol.HttpContext;
- import org.apache.http.protocol.HttpRequestExecutor;
- import org.apache.http.protocol.RequestConnControl;
- import org.apache.http.protocol.RequestContent;
- import org.apache.http.protocol.RequestExpectContinue;
- import org.apache.http.protocol.RequestTargetHost;
- import org.apache.http.protocol.RequestUserAgent;
- import org.apache.http.util.EntityUtils;
- public class HttpGet {
- public static void main(String[] args) throws Exception {
- HttpParams params = new BasicHttpParams();
-
- HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
-
- HttpProtocolParams.setContentCharset(params, "UTF-8");
-
-
-
-
-
-
-
-
- HttpProtocolParams.setUserAgent(params, "HttpComponents/1.1");
- HttpProtocolParams.setUseExpectContinue(params, true);
- BasicHttpProcessor httpproc = new BasicHttpProcessor();
- httpproc.addInterceptor(new RequestContent());
- httpproc.addInterceptor(new RequestTargetHost());
- httpproc.addInterceptor(new RequestConnControl());
- httpproc.addInterceptor(new RequestUserAgent());
- httpproc.addInterceptor(new RequestExpectContinue());
- HttpRequestExecutor httpexecutor = new HttpRequestExecutor();
- HttpContext context = new BasicHttpContext(null);
- HttpHost host = new HttpHost("vote.csdn.net", 80);
- DefaultHttpClientConnection conn = new DefaultHttpClientConnection();
- ConnectionReuseStrategy connStrategy = new DefaultConnectionReuseStrategy();
- context.setAttribute(ExecutionContext.HTTP_CONNECTION, conn);
- context.setAttribute(ExecutionContext.HTTP_TARGET_HOST, host);
- System.out.println("<table>");
- try {
-
- for (int i = 1; i <= 85; i++) {
- if (!conn.isOpen()) {
- Socket socket = new Socket(host.getHostName(), host.getPort());
- conn.bind(socket, params);
- }
- BasicHttpRequest request = new BasicHttpRequest("GET",
- "http://vote.csdn.net/VoteList.aspx?page=" + i);
- context.setAttribute(ExecutionContext.HTTP_REQUEST, request);
- request.setParams(params);
- httpexecutor.preProcess(request, httpproc, context);
- HttpResponse response = httpexecutor.execute(request, conn, context);
- response.setParams(params);
- httpexecutor.postProcess(response, httpproc, context);
-
- if (response.getStatusLine().getStatusCode() != 200) {
- break;
- }
- parseData(EntityUtils.toString(response.getEntity()));
- if (!connStrategy.keepAlive(response, context)) {
- conn.close();
- }
- }
- } finally {
- conn.close();
- }
- System.out.println("</table>");
- }
- static final Pattern p = Pattern
- .compile(
- "<h4>.*?<a href=.*?voteid=(//d+)/">(.*?)</a></h4>.*?发起人:<a href=.*?>(.*?)</a>.*?<a href=.*?>(//d+) 人投票</a>",
- Pattern.DOTALL);
-
- public static void parseData(String msg) {
- String[] parts = msg.split("div class=/"kimi_modifysty/">");
- Matcher m;
- for (String s : parts) {
- m = p.matcher(s);
- if (m.find()) {
- System.out.println("<tr><td>" + m.group(1)
- + "</td><td><a href='http://vote.csdn.net/VotePost.aspx?voteid=" + m.group(1) + "'>"
- + m.group(2).replace(",", ",") + "</a></td><td>" + m.group(3) + "</td><td>"
- + m.group(4) + "</td></tr>");
- }
- }
- }
- }
原文地址:https://www.cnblogs.com/wjy123/p/7661572.html