Analyzing Web Page Visit Behavior

Remove the duplicate IDs (here, the visitor IPs) and store the unique ones in the database:

import MySQLdb

# Open the database connection
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()

# Fetch every visitor IP recorded in the raw trace table
sql = 'select ip from t_useripvisittrace'
cur.execute(sql)
data = cur.fetchall()

# Collect the IPs, then deduplicate them with a set
user_ip = [row[0] for row in data]
userip = list(set(user_ip))

# Number the unique IPs and write (id, ip) pairs to the result table
value = [(i, ip) for i, ip in enumerate(userip)]
cur.executemany('insert into t_userIP_list values(%s,%s)', value)
db.commit()

# Close the database connection
db.close()
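
Incidentally, the dedup-and-number step can also be pushed into SQL itself, which avoids holding the full (duplicated) IP list in Python. A minimal sketch of that variant, assuming the same two tables as above:

import MySQLdb

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()
# SELECT DISTINCT deduplicates on the server side
cur.execute('select distinct ip from t_useripvisittrace')
value = [(i, row[0]) for i, row in enumerate(cur.fetchall())]
cur.executemany('insert into t_userIP_list values(%s,%s)', value)
db.commit()
db.close()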

Classify each page according to how the site is organized:

# -*- coding: utf-8 -*-
import MySQLdb
import requests
from lxml import etree

url = 'http://www.tipdm.org'

# Open the database connection
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()

# Fetch every visited page path from the trace table
cur.execute('select page_path from t_useripvisittrace')
data = cur.fetchall()
print(len(data))

count = 0
for each in data:
    path = each[0]
    if path == '/' or path == '/index.jhtml':
        # The site root and the index page are both the home page ('首页')
        row = (path, '首页')
    elif 'index' in path:
        # Section index pages: scrape the category name from the breadcrumb
        html = requests.get(url + path)
        selector = etree.HTML(html.text)
        content = selector.xpath('/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
        row = (path, content[0] if content else '其他')
    elif '.jhtml' in path:
        # Article pages: the breadcrumb sits at a slightly different XPath
        html = requests.get(url + path)
        selector = etree.HTML(html.text)
        content = selector.xpath('/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
        row = (path, content[0] if content else '其他')
    else:
        # Everything else goes into the catch-all category ('其他', "other")
        row = (path, '其他')
    # Insert only the new row; the original code appended to a growing list and
    # re-ran executemany on the whole list, re-inserting earlier rows every pass
    cur.execute('insert into t_url_classify values(%s,%s)', row)
    db.commit()
    count += 1

print(count)
print('finish')
db.close()
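
One practical refinement, not in the original post: page paths repeat heavily in a click trace, and the loop above downloads a page again for every repeated path. Wrapping the scrape in a small cache keyed by path keeps it to one HTTP request per distinct path; a sketch under that assumption:

import requests
from lxml import etree

url = 'http://www.tipdm.org'
cache = {}  # path -> category

def classify(path):
    # Return the cached category if this path has been seen before
    if path in cache:
        return cache[path]
    if path in ('/', '/index.jhtml'):
        category = '首页'
    elif 'index' in path or '.jhtml' in path:
        # Same breadcrumb scrape as above; the XPath differs by page type
        xp = ('/html/body/div[6]/div[2]/div[1]/div/a[2]/text()' if 'index' in path
              else '/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
        content = etree.HTML(requests.get(url + path).text).xpath(xp)
        category = content[0] if content else '其他'
    else:
        category = '其他'
    cache[path] = category
    return category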

Use pandas to read the database and compute the statistics:

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8')
sql = pd.read_sql('t_useripvisittrace', engine, chunksize=10000)
'''
create_engine opens the connection; the URL reads, in order, as
"dialect (mysql) + driver (pymysql) + user:password@host:port/database",
with the encoding fixed to utf8 at the end.
t_useripvisittrace is the table name, engine is the connection engine, and
chunksize makes each read return 10,000 records at a time. At this point
sql is just an iterator; no data has actually been read yet.
'''
import MySQLdb

# Open the database connection
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()
for chunk in sql:
    # Count how many times each IP appears in this chunk
    s = chunk['ip'].value_counts()
    value = [(ip, int(n)) for ip, n in s.items()]
    # The original placeholder '%,%s' was a typo for '%s,%s'
    cur.executemany('insert into userip values(%s,%s)', value)
    db.commit()
db.close()
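
Note that value_counts runs per chunk here, so an IP whose visits span two chunks produces two rows in userip. If a single global count per IP is the goal (an assumption about the intent, not the original code), the per-chunk tallies can be merged first:

import pandas as pd
from collections import Counter
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8')

total = Counter()
for chunk in pd.read_sql('t_useripvisittrace', engine, chunksize=10000):
    # Fold each chunk's counts into one running tally
    total.update(chunk['ip'].value_counts().to_dict())
value = list(total.items())  # (ip, count) pairs, ready for executemany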

Original post: https://www.cnblogs.com/caicaihong/p/5869653.html