package com.claw.util.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helpers that guess the character encoding of a byte stream or of the
 * document served at a URL, by feeding the leading bytes of the content to
 * {@code BytesEncodingDetect}.
 */
public class Charset {

    /** Charset name returned when detection cannot be performed or fails. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Number of leading bytes handed to the detector. */
    private static final int SAMPLE_SIZE = 1024;

    /** Connect and read timeout, in milliseconds, for URL based detection. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Manual test entry point. It only builds a list of sample URLs; the loop
     * that used them is disabled.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<String> list = new ArrayList<String>();
        list.add("http://li200429.iteye.com/blog/1608758");
        list.add("http://blog.csdn.net/vic0228/article/details/49634311");
        list.add("http://www.zhihu.com/");
        list.add("http://www.sohu.com/");
        list.add("http://blog.163.com/wenchangqing_live/blog/static/173722309201182044545864/");
        /*
        for (String url : list) {
            String html = getHTML(url);
            String title = getTitle(html);
            System.out.println("url:"+url+"  ----- title:"+title);
            if(title.equals("")){
                System.out.println(html);
            }
        }*/
    }

    /**
     * Guesses the charset of the data at the head of {@code in}.
     *
     * <p>Up to {@value #SAMPLE_SIZE} bytes are consumed from the stream. The
     * stream is not closed. A detected name of "GB-2312" is reported as "GBK".
     *
     * @param in stream positioned at the start of the content to inspect
     * @return the detected charset name, or "UTF-8" if reading or detection
     *         throws
     */
    public static String getCharset(InputStream in) {
        String charset = DEFAULT_CHARSET;
        try {
            String encode = detectEncodingName(readSample(in));
            /*if(encode.equals("ASCII")){
                encode = "UTF-8";
            }*/
            charset = toJavaCharsetName(encode);
        } catch (Exception e) {
            // Best effort: report the problem and fall back to the default.
            e.printStackTrace();
        }
        return charset;
    }

    /**
     * Guesses the charset of the document served at {@code urlStr}.
     *
     * <p>Original author's note: has problems with 404 responses; temporarily
     * disabled. Only a 200 response is inspected; for any other status, or on
     * any failure, "UTF-8" is returned. The status code and the raw detected
     * encoding name are printed to standard output.
     *
     * @param urlStr absolute HTTP URL of the document
     * @return the detected charset name, or "UTF-8" when it cannot be
     *         determined
     */
    public static String getCharset(String urlStr) {
        String charset = DEFAULT_CHARSET;
        HttpURLConnection conn = null;
        BufferedInputStream in = null;
        try {
            URL url = new URL(urlStr);
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            // Without a read timeout a stalled server would block forever.
            conn.setReadTimeout(TIMEOUT_MILLIS);
            conn.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)");
            conn.connect();
            int status = conn.getResponseCode();
            System.out.println(status);
            if (status == 200) {
                in = new BufferedInputStream(conn.getInputStream());
                String encode = detectEncodingName(readSample(in));
                System.out.println("encode:" + encode);
                charset = toJavaCharsetName(encode);
            }
        } catch (Exception e) {
            System.out.println(urlStr);
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    System.out.println(urlStr);
                    e.printStackTrace();
                }
            }
            // Release the connection on every path, including non-200 replies
            // where no stream was ever opened.
            if (conn != null) {
                conn.disconnect();
            }
        }
        return charset;
    }

    /**
     * Reads the leading bytes of {@code in} into a fixed-size buffer.
     *
     * <p>A single {@code read} call may deliver fewer bytes than requested, so
     * reading continues until the buffer is full or the stream ends. The
     * buffer is always {@value #SAMPLE_SIZE} bytes long; positions past the
     * data actually read stay zero.
     */
    private static byte[] readSample(InputStream in) throws IOException {
        byte[] sample = new byte[SAMPLE_SIZE];
        int filled = 0;
        while (filled < sample.length) {
            int n = in.read(sample, filled, sample.length - filled);
            if (n <= 0) {
                // End of stream, or a stream that makes no progress.
                break;
            }
            filled += n;
        }
        return sample;
    }

    /** Runs the detector on {@code sample} and returns its display name for the result. */
    private static String detectEncodingName(byte[] sample) {
        BytesEncodingDetect detector = new BytesEncodingDetect();
        return BytesEncodingDetect.nicename[detector.detectEncoding(sample)];
    }

    /** Maps the detector's "GB-2312" name to "GBK"; other names pass through unchanged. */
    private static String toJavaCharsetName(String encode) {
        if (encode.equals("GB-2312")) {
            return "GBK";
        }
        return encode;
    }
}