// 2016

package Demo;

import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;

/**
 * A small breadth-first web crawler.
 *
 * <p>Starting from a single address it prints every URL it visits and queues the
 * double-quoted {@code "http:...} links found in the text of each visited page.
 */
public class WebCrawler {

    /** The crawl loop keeps running while the number of visited URLs is at most this value. */
    private static final int VISITED_LIMIT = 10;

    /** Opening double quote plus scheme that marks the start of a link in a line of page text. */
    private static final String LINK_PREFIX = "\"http:";

    /**
     * Prompts for a start URL on standard input and crawls from it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Scanner in = new Scanner(System.in);
        System.out.println("Please enter a url");
        if (!in.hasNextLine()) {
            // Standard input was closed before a URL was supplied; nothing to crawl.
            return;
        }
        String url = in.nextLine();
        crawler(url);
    }

    /**
     * Crawls breadth-first from {@code url}, printing each URL the first time it is visited.
     *
     * <p>The loop ends when there is nothing left to visit or once the number of visited URLs
     * exceeds {@link #VISITED_LIMIT}.
     *
     * @param url the address to start crawling from
     */
    public static void crawler(String url) {
        ArrayList<String> pending = new ArrayList<>();
        ArrayList<String> visited = new ArrayList<>();
        pending.add(url);
        while (!pending.isEmpty() && visited.size() <= VISITED_LIMIT) {
            String next = pending.remove(0);
            if (visited.contains(next)) {
                // Already handled: skip it instead of downloading the same page again.
                continue;
            }
            visited.add(next);
            System.out.println(next);
            for (String link : getSubstring(next)) {
                // Queue each link once; duplicates would only be skipped later anyway.
                if (!visited.contains(link) && !pending.contains(link)) {
                    pending.add(link);
                }
            }
        }
    }

    /**
     * Reads the resource at {@code url} line by line and collects every double-quoted string
     * that starts with {@code http:}.
     *
     * <p>Any failure (malformed address, I/O error, ...) is reported via a stack trace and the
     * links gathered so far are returned, so one bad page does not abort a crawl.
     *
     * @param url address of the page to scan
     * @return the links found, in order of appearance, without the surrounding quotes;
     *         empty if none were found or the page could not be read
     */
    public static ArrayList<String> getSubstring(String url) {
        ArrayList<String> list = new ArrayList<>();
        try {
            URL page = new URL(url);
            // try-with-resources closes the scanner and the underlying stream on every path.
            // UTF-8 is named explicitly so decoding does not depend on the platform default.
            try (Scanner input = new Scanner(page.openStream(), "UTF-8")) {
                while (input.hasNextLine()) {
                    String line = input.nextLine();
                    int current = line.indexOf(LINK_PREFIX);
                    // >= 0 so that a link starting at the very first column is not missed.
                    while (current >= 0) {
                        int endIndex = line.indexOf("\"", current + 1);
                        if (endIndex < 0) {
                            // No closing quote on this line: nothing more to extract from it.
                            break;
                        }
                        list.add(line.substring(current + 1, endIndex));
                        current = line.indexOf(LINK_PREFIX, endIndex);
                    }
                }
            }
        } catch (Exception ex) {
            // Deliberately broad: links come from arbitrary pages and the crawl is best-effort.
            ex.printStackTrace();
        }
        return list;
    }

}

// Original source: https://www.cnblogs.com/laigaoxiaode/p/5562410.html