使用selenium做简单爬虫的实例

Selenium 是一个 Web 自动化测试的软件包,既可以用于自动测试 Web 应用,也可以当作简单的爬虫制作工具。

这是一个简单的 demo,用于爬取 Google Play 商店(play.google.com)中某一个类别下的应用:

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import sqlite3
import sys

# connect to the sqlite3 database

def Conn_DB(db_name='app_info.db'):
  """Open a connection to the SQLite database file `db_name`.

  Args:
    db_name: path of the database file; defaults to 'app_info.db'.

  Returns:
    The open sqlite3 connection.

  Raises:
    sqlite3.Error: if the database cannot be opened. The error is printed
      first and then re-raised, so the caller sees the real cause instead
      of an UnboundLocalError from returning a connection that was never
      created.
  """
  try:
    return sqlite3.connect(db_name)
  except sqlite3.Error as e:
    print("Conn Error  %s" % e)
    raise

# get the category of the apps

def Get_Category(root_address):
  """Return the category name found at the end of a category URL.

  The category is the text after the last '/' in `root_address`, cut off at
  the first '?' or space (whichever comes first). If neither is present the
  whole last segment is returned.
  """
  last_segment = root_address.rsplit('/', 1)[-1]
  # Turn '?' into a space so that a single partition handles both terminators.
  category, _, _ = last_segment.replace('?', ' ').partition(' ')
  return category

# we have to log in first in order to get the info for every app

def Login_Google(browser, category_root_address, email='', password=''):
  """Open the category page and submit the Google sign-in form.

  Args:
    browser: the selenium webdriver instance to drive.
    category_root_address: URL of the page to open before signing in.
    email: account e-mail typed into the 'Email' field. Defaults to an
      empty string, which matches the previous hard-coded value.
    password: password typed into the 'Passwd' field. Defaults to an empty
      string, which matches the previous hard-coded value.

  Returns:
    None.
  """
  browser.get(category_root_address)

  # click the element with id 'gb_70' to open the login form
  login_link = browser.find_element_by_id('gb_70')
  webdriver.ActionChains(browser).move_to_element(login_link).click(login_link).perform()

  email_field = browser.find_element_by_name('Email')
  email_field.send_keys(email)

  password_field = browser.find_element_by_name('Passwd')
  password_field.send_keys(password)
  # pressing RETURN in the password field submits the form
  password_field.send_keys(Keys.RETURN)

  # NOTE: this is printed right after submitting; the result of the login
  # attempt is not checked here.
  print('Login Success')


# load the whole page and then return the number of the apps under the category

def Load_All_Apps(browser, rounds=13, pause=2.5):
  """Scroll the page repeatedly so that it loads more apps, then count them.

  Each round scrolls to the bottom, then back to the middle, and clicks the
  '#show-more-button' element unless its inline style is 'display: none;'.

  Args:
    browser: the selenium webdriver instance showing the category page.
    rounds: how many scroll rounds to perform. The default of 13 was found
      by trial for the original category page.
    pause: seconds to wait after each scroll so new content can load.

  Returns:
    The number of 'button.price' elements on the page after loading.
  """
  for times in range(rounds):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(pause)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.5);")
    sleep(pause)
    print(times)

    # Read the inline style of the "show more" button; null (None) when the
    # button is not in the page, so a missing button does not raise.
    show_more_button = browser.execute_script(
        "var button = document.querySelector('#show-more-button');"
        "return button ? button['style']['cssText'] : null;")
    if show_more_button is not None and show_more_button != 'display: none;':
      browser.execute_script("document.querySelector('#show-more-button').click();")
      print('click button')
    print(show_more_button)

  # go back to the top of the page
  browser.execute_script("window.scrollTo(0, 0);")

  number = browser.execute_script("return document.querySelectorAll('button.price').length;")
  print(number)

  return number

def Click_Install_Button(browser, category_root_address):
  """Click every second install button and record apps that use location.

  For each clicked button the permission dialog is inspected: 'p' is
  collected for every "precise location" permission and 'a' for every
  "approximate location" permission. Apps with at least one of them are
  inserted into the `app_info` table of the database opened by Conn_DB().

  Args:
    browser: the selenium webdriver instance showing the loaded category page.
    category_root_address: the category URL; its category name is stored
      with every row.

  Returns:
    None. Progress and errors are printed; rows are committed one by one.
  """
  # Walk the NodeList by index: a for...in loop would also visit
  # non-element properties such as 'length' and 'item'.
  get_permissions_code = """var permissions = document.querySelectorAll('.perm-description');
var precise_locaton = 'precise location (GPS and network-based)';
var approximate_location = 'approximate location (network-based)';
var ways = '';

for (var i = 0; i < permissions.length; i++) {
  if (permissions[i].innerHTML == precise_locaton) {
    ways += 'p';
  } else if (permissions[i].innerHTML == approximate_location) {
    ways += 'a';
  }
}
return ways;"""

  # get all install button objects
  get_button_list_code = """return document.querySelectorAll('button.price');"""
  button_list = browser.execute_script(get_button_list_code)
  numbers_of_button = len(button_list)

  count = 0
  sleep(3)

  category = Get_Category(category_root_address)

  get_app_address_code = """var app_address_list = document.querySelectorAll("h2 a");var list = [];
for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i]['href']);} return list;"""
  address_list = browser.execute_script(get_app_address_code)

  number_of_i_want = 0

  # Placeholders let sqlite3 do the quoting, so names containing quotes no
  # longer break (or inject into) the statement. The column name 'categroy'
  # is kept as spelled because it has to match the existing table.
  insert_sql = u"""insert into app_info (categroy, name, link, get_geo_ways) values (?, ?, ?, ?)"""

  conndb = Conn_DB()
  db_cursor = conndb.cursor()
  try:
    # Only the buttons at odd indexes are clicked.
    # NOTE(review): presumably each app contributes two 'button.price'
    # elements to the page -- confirm against the page markup.
    for index in range(1, numbers_of_button, 2):
      try:
        webdriver.ActionChains(browser).move_to_element(button_list[index]).click(button_list[index]).perform()
        sleep(3.5)
        count += 1
      except IndexError:
        print("Out of index")
        break

      # Reset per iteration so the error message below never reports a
      # missing or stale value.
      perms = None
      try:
        print("Count  %s" % count)
        perms = browser.execute_script(get_permissions_code)
        sleep(2)
        appname = browser.execute_script("return document.querySelector('.purchase-header .title').innerHTML;")
        # the n-th clicked button is paired with the n-th collected link
        address = address_list[count - 1]
        print(u"App id is:  %s Perm is:  %s Address is:  %s" % (appname, perms, address))

        if perms:
          db_cursor.execute(insert_sql, (category, appname, address, perms))
          conndb.commit()
          number_of_i_want += 1

      except Exception as e:
        # best effort: report the failure and move on to the next app
        print("Error for  %s Number is  %s Pers is %s" % (e, count, perms))
        continue
      # click the cancel button to close the dialog
      browser.execute_script("document.querySelector('#purchase-cancel-button').click();")
      sleep(1)

    print("compary  %s %s I want : %s" % (count, numbers_of_button, number_of_i_want))
  finally:
    # release the database even when an unexpected error escapes the loop
    db_cursor.close()
    conndb.close()

if __name__ == '__main__':
  # Script entry point: log in, load the whole category page, scan the
  # apps, then write a marker file once everything has finished.
  root_address = 'https://play.google.com/store/apps/category/TRAVEL_AND_LOCAL?hl=en'

  driver = webdriver.Chrome()
  try:
    Login_Google(driver, root_address)
    Load_All_Apps(driver)
    Click_Install_Button(driver, root_address)
  finally:
    # always shut the browser down so no Chrome/chromedriver process is left behind
    driver.quit()

  # marker file, written only after a successful run
  with open("./res.txt", "wb") as fd:
    fd.write(b"over")

  

原文地址:https://www.cnblogs.com/jaw-crusher/p/3669387.html