Groovy / Java:下载网页时解决中文乱码的办法

GAE(Google App Engine)网址抓取(java.net.URLConnection)

URL url = new URL("http://fatkun.com"); // 读取源码 //读取中文时,使用Reader类是每次读出两个字节的,不会出现中文乱码 InputStreamReader in = new InputStreamReader(url.openStream(), "UTF-8"); char[] buf = new char[2048];//缓存 StringBuffer sb = new StringBuffer(); int len = 0; while ((len = in.read(buf)) != -1) {//当没到文档尽头继续读取 sb.append(buf, 0, len); }




import groovy.json.JsonSlurper;
import java.util.regex.*;
import sun.net.www.protocol.http.HttpURLConnection;
// Send HTTP requests through a proxy. The host and port are hard-coded
// values from the author's environment; adjust or remove as needed.
System.properties.putAll( ["http.proxyHost":"10.10.243.140", "http.proxyPort":"808"] )

URL url = new URL("http://detail.tmall.com/item.htm?spm=a220z.1000880.0.44&id=16761700638");
// Download the page source, decoding the response bytes as GBK so Chinese
// text is not garbled. URL.getText(charset) reads the stream to the end and
// closes it, so no reader is left open.
def html = url.getText("GBK")
print html
// Append the decoded page to a local file (repeated runs keep appending).
def out = new File('/home/mlzboy/aaa.html')
out.append html

/**
 * Extracts the part of a string that lies after a start marker and before an
 * end marker.
 *
 * @param ohtml the text to search; null or blank text yields null
 * @param start optional marker; when given, everything up to and including
 *              its first occurrence is dropped. null or "" means "from the
 *              beginning".
 * @param end   optional marker; when given, everything from its first
 *              occurrence (searched after the start marker) is dropped.
 *              null or "" means "to the end".
 * @return the extracted text (possibly empty), or null when the input is
 *         null/blank or a requested marker is not found
 */
def cut(ohtml,start=null,end=null)
{
    if(ohtml==null || ohtml.trim().length()==0 )
        return null

    def html=ohtml
    if(start)
    {
        def s=html.indexOf(start)
        if(s==-1)
            return null
        // substring is safe when the marker is the tail of the text
        // (yields ""), where a range ending in -1 would be out of bounds.
        html=html.substring(s+start.length())
    }

    if(end)
    {
        def e=html.indexOf(end)
        if(e==-1)
            return null
        // substring(0, e) yields "" when the marker is at index 0; the range
        // 0..e-1 would become 0..-1 and return the whole string instead.
        html=html.substring(0,e)
    }
    return html
}

        def r=cut(html,"货号:&nbsp;","</li>")
        if (r!=null && r.size()>0)
        {
            println r
        }
        else
        {
            println "haah"
        }
原文地址:https://www.cnblogs.com/lexus/p/2636370.html