[读书笔记] Python数据分析 (二) 引言

 

1. 数据分析的任务:数据读写,数据准备(清洗,修整,规范化,重塑,切片切块,变形),转换,建模计算,呈现(模型/数据)

2. 数据集:

bit.ly的1.usa.gov数据:URL缩短服务bit.ly和美国政府usa.gov合作从.gov或.mil用户那里收集的匿名数据

# -*- coding:utf-8 -*-
#导入json模块,将json字符串转换为python字典
import json
from collections import defaultdict
from collections import Counter
from pandas import DataFrame, Series
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

path = "E:/Programming/Python/PythonDataAnalysis/datasets/usagov_bitly/example.txt"
# Parse the file line by line: each line holds one JSON object, which becomes
# one dict. The context manager closes the file handle as soon as parsing ends.
with open(path) as f:
    records = [json.loads(line) for line in f]
# Collect the time zone of every record; records without a 'tz' key are skipped.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
#-------------- Method 1: plain dict ------------
# Count how often each time zone occurs
def get_counts(sequence):
    """Return a dict mapping each distinct item of ``sequence`` to its count."""
    tally = {}
    for item in sequence:
        # get() supplies 0 the first time an item is seen
        tally[item] = tally.get(item, 0) + 1
    return tally
# Get the n most frequently used time zones
def top_counts(count_dict, n=10):
    """Return the ``n`` largest entries of ``count_dict`` as (count, key) pairs.

    The pairs are sorted in ascending order, so the most frequent key comes
    last. Ties on the count are ordered by key. Fewer than ``n`` pairs are
    returned when the dict is smaller than ``n``.
    """
    # A slice of [-0:] would be the whole list, so handle "no entries
    # requested" explicitly instead of relying on the slice.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
counts = get_counts(time_zones)
print(counts)
# Keep the result under its own name: assigning it to ``top_counts`` would
# replace the function with a list and make any later call to it fail.
top_tz_counts = top_counts(counts)
print(top_tz_counts)
#-------------- Method 2: defaultdict ------------
def get_counts2(sequence):
    """Count the items of ``sequence`` and return the ``defaultdict(int)`` tally."""
    tally = defaultdict(int)  # unseen keys start at 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally
#-------------- Method 3: collections.Counter ------------
def get_counts3(time_zones, n=10):
    """Return the ``n`` most common items of ``time_zones``.

    The result is a list of (item, count) pairs, most frequent first.
    """
    return Counter(time_zones).most_common(n)

top_counts3 = get_counts3(time_zones, n=10)
print(top_counts3)
#-------------- Method 4: pandas ------------
# Turn the list of record dicts into a DataFrame
frame = DataFrame(records)
# value_counts() on the 'tz' column tallies each distinct time zone
tz_counts = frame['tz'].value_counts()
print(tz_counts.head(10))
# Replace missing (NA) time zones with the label "Missing"
clean_tz = frame['tz'].fillna("Missing")
print(clean_tz)
# Empty strings are relabelled "Unknown"
is_blank = clean_tz == ''
clean_tz[is_blank] = "Unknown"
tz_counts = clean_tz.value_counts()
print(tz_counts.head(10))
# Horizontal bar chart of the first ten counts
tz_counts.head(10).plot(kind="barh", rot=0)
plt.show()
# Browser analysis: take the first whitespace-separated token of each
# non-missing value in column 'a'
results = Series([agent.split()[0] for agent in frame['a'].dropna()])
# Print the 8 most common tokens
print(results.value_counts().head(8))
# Keep only the rows where column 'a' is present
cframe = frame[frame['a'].notnull()]
# Label each remaining row by whether its 'a' value contains "Windows"
operating_system = np.where(cframe['a'].str.contains("Windows"), "Windows", "Not Windows")
# Tally the two labels
windows = sum(1 for op in operating_system if op == "Windows")
nonWindows = len(operating_system) - windows
print("windows:", windows, "nonWindows:", nonWindows)
# Group the rows by time zone and by the Windows / Not Windows label
by_tz_os = cframe.groupby(['tz', operating_system])
# Rows per group, with the OS label pivoted into columns; combinations that
# never occur become 0
group_sizes = by_tz_os.size()
agg_counts = group_sizes.unstack().fillna(0)
print(agg_counts.head(10))
# Positions that sort the time zones by their total row count, ascending
indexer = agg_counts.sum(axis=1).argsort()
print(indexer)
# The last ten rows in that order are the ten most common time zones
count_subset = agg_counts.take(indexer).tail(10)
print(count_subset)
# Stacked horizontal bar chart of Windows / Not Windows counts
count_subset.plot(kind="barh", stacked=True)
# Without this call the figure shows up in IPython but not when run as a script
plt.show()
# Normalise each row so its values sum to 1
row_totals = count_subset.sum(axis=1)
normed_subset = count_subset.div(row_totals, axis=0)
normed_subset.plot(kind="barh", stacked=True)
plt.show()

MovieLens 1M数据集:20世纪90年代末到21世纪初,由6000名用户对4000部电影给出的100万条评分数据,分为3个表:电影评分,电影元数据(类型,年代),用户的人口统计学数据(年龄,邮编,性别,职业)

# -*- coding: utf-8 -*-
import pandas as pd 
import os
# Read the data into three tables. The files use '::' as separator and have
# no header row; a multi-character separator requires the python engine.
path = 'E:/Programming/Python/PythonDataAnalysis/datasets/movielens/'
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
upath = os.path.join(path, 'users.dat')
users = pd.read_table(upath, sep="::", header=None, names=unames, engine='python')
rnames = ['user_id', "movie_id", "rating", "timestamp"]
ratings = pd.read_table(os.path.join(path, 'ratings.dat'), sep="::", header=None, names=rnames, engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(os.path.join(path, 'movies.dat'), sep="::", header=None, names=mnames, engine='python')
# Combine the tables: merge joins on the column names the frames share
data = pd.merge(pd.merge(ratings, users), movies)
print(data[:10])
# First merged row, selected by position (.ix has been removed from pandas)
print(data.iloc[0])
# Mean rating of each movie per gender: titles are the row labels and the
# gender values become the columns
mean_ratings = data.pivot_table('rating', index='title', columns="gender", aggfunc='mean')
print(mean_ratings[:10])
# Number of ratings per title, used to filter out movies with fewer than 250
ratings_by_title = data.groupby('title').size()
print(ratings_by_title[:10])
active_titles = ratings_by_title[ratings_by_title >= 250]
print(active_titles)
# Keep only the titles with >= 250 ratings; label-based .loc replaces the
# removed .ix indexer
mean_ratings = mean_ratings.loc[active_titles.index]
# sort_values replaces the removed sort_index(by=...) form
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
print(top_female_ratings[:10])
# Find the movies on which male and female ratings disagree the most
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

# Ascending sort on M - F; sort_values replaces the removed sort_index(by=...)
sorted_by_diff = mean_ratings.sort_values(by='diff')
# Smallest M - F first: largest disagreement where women rated higher
print(sorted_by_diff[:15])
# Reverse the order and take the first 15 rows: movies men rated higher
print(sorted_by_diff[::-1][:15])
# Most divisive movies overall, measured by the standard deviation of ratings
rating_std_by_title = data.groupby('title')['rating'].std()
# Filter down to the active titles. active_titles is a Series of rating counts
# indexed by title, so the selection must use its index (the titles), not its
# values; .loc replaces the removed .ix indexer.
rating_std_by_title = rating_std_by_title.loc[active_titles.index]
# sort_values replaces the removed Series.order(); it returns a new Series,
# so the result has to be assigned for the printout below to be sorted.
rating_std_by_title = rating_std_by_title.sort_values(ascending=False)
print(rating_std_by_title[:15])

1880-2010年间婴儿名字频率数据

# -*- coding:utf-8 -*-
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
path = 'E:/Programming/Python/PythonDataAnalysis/datasets/babynames/'
names1880 = pd.read_csv(path + 'yob1880.txt', names=['name', 'sex', 'births'], engine='python')
# Total births per sex in the 1880 file. The result is printed explicitly:
# a bare expression is only echoed in an interactive session, so running this
# as a script would otherwise compute the sums and throw them away.
print(names1880.groupby('sex').births.sum())
# Combine the per-year files into a single table
years = range(1880, 2011)
columns = ['name', 'sex', 'birth']
pieces = []
for year in years:
    subpath = 'yob{:d}.txt'.format(year)
    frame = pd.read_csv(path + subpath, names=columns)
    # Tag every row with the year of the file it was read from
    frame['year'] = year
    pieces.append(frame)
# ignore_index=True replaces the per-file row numbers with one fresh index
names = pd.concat(pieces, ignore_index=True)
# Aggregate with pivot_table(): total births per year (rows) and sex (columns).
# The aggregation is named by string; newer pandas versions warn when the
# builtin ``sum`` is passed instead.
total_births = names.pivot_table('birth', index='year', columns='sex', aggfunc='sum')
print(total_births.tail())
# Add a 'prop' column holding each row's share of the group's total births
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``birth`` value divided by the sum of ``birth``
    over the whole group. The frame passed in is left unmodified.
    """
    # Cast to float so the division is a true division for integer counts
    births = group.birth.astype(float)
    # assign() builds a new frame instead of writing into the caller's object
    return group.assign(prop=births / births.sum())

# Add the 'prop' column group by group. Iterating over the groups and
# concatenating the pieces keeps the original flat row index and the
# 'year'/'sex' columns; GroupBy.apply on pandas 2.x also adds the group keys to
# the index, which makes the later groupby on ['year', 'sex'] ambiguous.
# sort_index() restores the original row order.
names = pd.concat(
    [add_prop(group) for _, group in names.groupby(['year', 'sex'])]
).sort_index()
# Take the 1000 most frequent names of each sex/year combination
def get_top1000(group):
    """Return the (up to) 1000 rows of ``group`` with the largest ``birth``.

    Rows are ordered by ``birth`` descending. The slice starts at position 0
    so that the single most frequent row is included.
    """
    return group.sort_values(by='birth', ascending=False).iloc[:1000]
grouped = names.groupby(['year', 'sex'])
# Apply get_top1000 to each group and concatenate the pieces. Unlike
# grouped.apply(...), this does not add the 'year'/'sex' group keys to the
# index, so 'year' and 'sex' stay plain columns and the later filters and
# pivot tables can refer to them without ambiguity.
top1000 = pd.concat([get_top1000(group) for _, group in grouped])
# The naming-trend analysis below works on this top-1000 data set
# Rows for boys
boys = top1000[top1000.sex == 'M']
# Rows for girls
girls = top1000[top1000.sex == 'F']
# Total births per year (rows) and name (columns). The aggregation is named
# by string; newer pandas versions warn when the builtin ``sum`` is passed.
total_births = top1000.pivot_table('birth', index='year', columns='name', aggfunc='sum')
# One subplot per selected name
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")
plt.show()
# Look at how naming diversity changes: sum the 'prop' values of the top-1000
# rows per year and sex. The aggregation is named by string; newer pandas
# versions warn when the builtin ``sum`` is passed.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title="sum of table1000.prop by year and sex", yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
plt.show()
# Changes in the last letter of names

  

原文地址:https://www.cnblogs.com/vincentcheng/p/7903179.html