提取网页链接

package com.zyw.regex;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TestRegularExpression {
    public static void main(String[] args) {
        Map<UrlContent,Boolean> firstallUrl=new LinkedHashMap();
        Map<UrlContent,Boolean> secondallUrl=new LinkedHashMap();
        Pattern p=Pattern.compile("<a.*?href=["']?((https?://)?/?[^"']+)["']?.*?>(.+)</a>");//匹配整个<a></a>
        Pattern p1=Pattern.compile("(https?|ftp|http)://[a-zA-Z0-9]*.[a-zA-Z0-9]+.\w{2,3}/[\w\d-/.]*(?=")");//匹配url
        Pattern p2=Pattern.compile("(?<=>)[\w\su4e00-u9fa5]*(?=</a>)");//匹配<a></a>中内容
        addUrl(firstallUrl, "http://www.qq.com/", p, p1, p2);
        for (Iterator it = firstallUrl.keySet().iterator(); it.hasNext();) {
            UrlContent key = (UrlContent) it.next();
            addUrl(secondallUrl, key.getUrl(), p, p1, p2);
            if (secondallUrl.size() > 1000)
                break;
        }
        int i = 0;
        for (UrlContent key : secondallUrl.keySet()) {
            System.out.println(++i + " " + key.getUrl() + " -----"+ key.getContent());
        }
}

    public static void addUrl(Map<UrlContent, Boolean> allUrl,String link, Pattern p,Pattern p1, Pattern p2) { 
        try {
            URL url = new URL(link);
            InputStream in = url.openStream();
            InputStreamReader isr = new InputStreamReader(in, "utf-8");
            BufferedReader br = new BufferedReader(isr);
            String s = "";
            while ((s = br.readLine()) != null) {
                Matcher m=p.matcher(s);
                while (m.find()){
                    UrlContent content=new UrlContent();
                    String text=m.group();
                    Matcher m1=p1.matcher(text);
                    Matcher m2=p2.matcher(text);
                    while (m1.find()){
                        content.setUrl(m1.group());
                    }
                    while (m2.find()){
                        content.setContent(m2.group());
                    }
                    if(content.getUrl()!=null)
                    allUrl.put(content, false);
                }
                s = br.readLine();
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}
package com.zyw.regex;

public class UrlContent {
    private String url;
    private String content;
    public String getUrl() {
        return url;
    }
    public void setUrl(String url) {
        this.url = url;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
    
}
原文地址:https://www.cnblogs.com/yunwuzhan/p/5454100.html