爬虫验证码破解任务

之前爬取的网站中,有一部分需要先输入验证码才能点击下载图片,所以查阅了一些破解验证码相关的资料。

思路:先对页面截图并裁剪出验证码图片,保存到临时文件中;再调用识别接口破解验证码;然后通过 Selenium 将识别出的验证码模拟输入到文本框中,最后模拟点击提交按钮完成验证。

附上代码:

// Simulate a click on the "free download" button.
driver.findElement(By.id("detail_free_download_btn")).click();
try {
    // Fixed pause after the click (presumably to let the page update before it is inspected).
    TimeUnit.SECONDS.sleep(1);
} catch (InterruptedException e) {
    // Restore the interrupt flag so code further up the stack can still observe it.
    Thread.currentThread().interrupt();
    e.printStackTrace();
}
// Check whether a captcha dialog popped up: parse the current page source and look
// for the dialog container.
Document checkDoc = Jsoup.parse(driver.getPageSource());
if (checkDoc != null) {
    Element verifyElement = checkDoc.select("div.visit-tc-main").first();
    if (verifyElement != null) { // captcha dialog is present
        crackCaptcha(checkDoc, url);
        // Click "free download" again after the captcha has been handled.
        driver.findElement(By.id("detail_free_download_btn")).click();
        try {
            TimeUnit.SECONDS.sleep(1);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}

/**
* 破解验证码
* @param document
* @param accountUrl
* @return
*/
private void crackCaptcha(Document document, String accountUrl) {
if (document != null && document.text().contains("输入验证码")) {
File captchaFile = screenshotCaptcha(driver, driver.findElement(By.id("download_verify_code"))); //下载验证码图片到本地
String captcha = recognizeCaptcha(captchaFile);  //识别验证码
LOG.info("Recognizing captcha {}", captcha);
// 输入验证码
WebElement webElement = driver.findElement(By.id("download-verify-input-code"));
webElement.sendKeys(captcha);
// 提交按钮
driver.findElement(By.id("check-download-verify-code")).click();
try {
TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}

/**
 * Takes a screenshot through {@code driver}, crops it to the area of {@code element} and
 * stores the result as a PNG file in the {@code verify} directory under the system temp dir.
 *
 * <p>Every file already present in that directory is deleted first.
 *
 * @param driver  browser session; must also implement {@code TakesScreenshot}
 * @param element element whose location and size define the crop rectangle
 * @return the PNG file holding the cropped image, or {@code null} if capturing or writing failed
 */
private File screenshotCaptcha(WebDriver driver, WebElement element) {
    // java.io.tmpdir is not guaranteed to end with a path separator ("/tmp" on many systems),
    // so build child paths with File(parent, child) instead of string concatenation.
    File verifyDir = new File(System.getProperty("java.io.tmpdir"), "verify");
    try {
        if (!verifyDir.isDirectory() && !verifyDir.mkdirs()) {
            throw new IOException("Cannot create captcha directory " + verifyDir);
        }

        // Remove images left over from earlier runs; listFiles() returns null on I/O error.
        File[] staleFiles = verifyDir.listFiles();
        if (staleFiles != null) {
            for (File stale : staleFiles) {
                stale.delete();
            }
        }

        File target = new File(verifyDir, UUID.randomUUID().toString() + ".png");
        File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
        BufferedImage fullImage = ImageIO.read(screenshot);
        if (fullImage == null) {
            throw new IOException("Screenshot could not be decoded: " + screenshot);
        }

        // NOTE(review): the crop uses the element's reported location/size directly as pixel
        // coordinates in the screenshot — confirm this holds on scaled (HiDPI) displays.
        Point origin = element.getLocation();
        int width = element.getSize().getWidth();
        int height = element.getSize().getHeight();
        // getSubimage throws RasterFormatException when the rectangle leaves the image bounds;
        // that is handled by the catch below.
        BufferedImage cropped = fullImage.getSubimage(origin.getX(), origin.getY(), width, height);

        // Write the cropped image straight to its final location.
        ImageIO.write(cropped, "png", target);
        return target;
    } catch (IOException | RuntimeException e) {
        e.printStackTrace();
        // Signal failure explicitly instead of handing back a path to a file that was never written.
        return null;
    }
}

/**
 * Sends the captcha image to the jisuapi recognition endpoint and returns the recognized text.
 *
 * <p>The file content is Base64-encoded and posted as the {@code pic} parameter together with
 * {@code appkey} and {@code type}. The answer is read from {@code result.code} of the JSON
 * response.
 *
 * @param file captcha image to recognize; may be {@code null}
 * @return the recognized code, or an empty string when {@code file} is {@code null}, the file
 *         cannot be read, or the response carries no {@code result.code}
 */
public String recognizeCaptcha(File file) {
    if (file == null) {
        return "";
    }
    try {
        String base64Image = Base64.encodeBase64String(FileUtils.readFileToByteArray(file));
        // Captcha recognition endpoint of the jisuapi platform (a paid service); full form:
        // http://api.jisuapi.com/captcha/recognize?appkey=yourappkey&type=n4
        // NOTE(review): plain HTTP sends the appkey unencrypted — consider HTTPS if the
        // provider supports it for this endpoint.
        String requestUrl = "http://api.jisuapi.com/captcha/recognize";
        Map<String, String> parameters = Maps.newHashMap();
        parameters.put("appkey", "*******");
        parameters.put("type", "***");
        parameters.put("pic", base64Image);

        String response = WebUtil.post(requestUrl, parameters, 5000, 20000);
        logger.info("Response is {}", response);

        Map<?, ?> data = JsonUtil.fromJson(response, Map.class);
        if (data == null) {
            return "";
        }
        // Check the runtime type before descending: a "result" that is not a JSON object
        // would otherwise fail with ClassCastException.
        Object resultNode = data.get("result");
        if (!(resultNode instanceof Map)) {
            return "";
        }
        Object code = ((Map<?, ?>) resultNode).get("code");
        // Never hand null to callers; a non-string code is returned in its string form.
        return code == null ? "" : code.toString();
    } catch (IOException e) {
        logger.error("Failed to recognize captcha file {}", file, e);
        return "";
    }
}

原文地址:https://www.cnblogs.com/yzf666/p/7055460.html