pandas习题100道

#1
import pandas as pd
#2
pd.__version__

'1.0.5'

#3
pd.show_versions()

INSTALLED VERSIONS
------------------
commit           : None
python           : 3.8.3.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 19.6.0
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : zh_CN.UTF-8
LOCALE           : zh_CN.UTF-8

pandas           : 1.0.5
numpy            : 1.18.5
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 20.1.1
setuptools       : 49.2.0.post20200714
Cython           : 0.29.21
pytest           : 5.4.3
hypothesis       : None
sphinx           : 3.1.2
blosc            : None
feather          : None
xlsxwriter       : 1.2.9
lxml.etree       : 4.5.2
html5lib         : 1.1
pymysql          : 0.10.1
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.16.1
pandas_datareader: None
bs4              : 4.9.1
bottleneck       : 1.3.2
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.5.2
matplotlib       : 3.2.2
numexpr          : 2.7.1
odfpy            : None
openpyxl         : 3.0.4
pandas_gbq       : None
pyarrow          : None
pytables         : None
pytest           : 5.4.3
pyxlsb           : None
s3fs             : None
scipy            : 1.5.0
sqlalchemy       : 1.3.18
tables           : 3.6.1
tabulate         : None
xarray           : None
xlrd             : 1.2.0
xlwt             : 1.3.0
xlsxwriter       : 1.2.9
numba            : 0.50.1

#4创建一个DataFrame(df)，用data做数据，labels做行索引
import numpy as np
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
      'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
      'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
      'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df = pd.DataFrame(data,index=labels)
df

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	yes
e	dog	5.0	2	no
f	cat	2.0	3	no
g	snake	4.5	1	no
h	cat	NaN	1	yes
i	dog	7.0	2	no
j	dog	3.0	1	no

#5显示有关此df及其数据的基本信息的摘要
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes

	age	visits
count	8.000000	10.000000
mean	3.437500	1.900000
std	2.007797	0.875595
min	0.500000	1.000000
25%	2.375000	1.000000
50%	3.000000	2.000000
75%	4.625000	2.750000
max	7.000000	3.000000

#6查看此df的前三行数据
df.head(3)

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no

#7选择df中列标签为animal和age的数据
df1 = df[['animal','age']]
df1

	animal	age
a	cat	2.5
b	cat	3.0
c	snake	0.5
d	dog	NaN
e	dog	5.0
f	cat	2.0
g	snake	4.5
h	cat	NaN
i	dog	7.0
j	dog	3.0

#8选择行为[3, 4, 8]，且列为['animal', 'age']中的数据
#本题的难点在于当行索引被命名为非数字时不能再使用loc按照数字取值,如果使用iloc就不能用名称取列,而本题的要求是按数字取行,按名称取列
#解法一:分两次取值
df2 = df.iloc[[3,4,8],:][['animal', 'age']]
#解法二:取行索引然后按索引取
df2 = df.loc[df.index[[3,4,8]],['animal', 'age']]
#错误用法:
# df2 = df.loc[[3,4,8],['animal', 'age']]
df2

	animal	age
d	dog	NaN
e	dog	5.0
i	dog	7.0

#9选择visuts大于2的动物种类
df
bool_s=df.visits>2
df[bool_s]['animal']

b    cat
d    dog
f    cat
Name: animal, dtype: object

#10选择age为缺失值的行
df[df.age.isnull()]
df[df['age'].isnull()]

	animal	age	visits	priority
d	dog	NaN	3	yes
h	cat	NaN	1	yes

#11选择animal为cat，且age小于3的行
df[(df['animal']=='cat') & (df['age']<3)]

	animal	age	visits	priority
a	cat	2.5	1	yes
f	cat	2.0	3	no

#12选择age在2到4之间的数据(包含边界值)
df[(df['age']>=2) &(df['age']<=4)]

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
f	cat	2.0	3	no
j	dog	3.0	1	no

#13将f行的age改为1.5
# df.loc['f']['age']=1.5 这个操作不规范会警报
df.loc['f','age']=1.5

#14计算visits列的数据总和
res = df['visits'].sum()
res

#15计算每种animal的平均age
df.groupby('animal')['age'].mean()

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

#16追加一行(k)，列的数据自定义，然后再删除新追加的k行

df.loc['k']=df.loc['a'].values
df.drop('k',inplace=True) #删除行
# del df['k'] #删除列
df

	animal	age	visits	priority
a	cat	2.5	1	yes
b	cat	3.0	3	yes
c	snake	0.5	2	no
d	dog	NaN	3	yes
e	dog	5.0	2	no
f	cat	2.0	3	no
g	snake	4.5	1	no
h	cat	NaN	1	yes
i	dog	7.0	2	no
j	dog	3.0	1	no

#17计算每种animal的个数(cat有几个，dog几个...)
df.groupby('animal').size()
df['animal'].value_counts()
# df['animal'].unique() #查看动物的种类
# df['animal'].nunique() #查看动物种类的个数

dog      4
cat      4
snake    2
Name: animal, dtype: int64

#18先根据age降序排列，再根据visits升序排列
df.sort_values(by=['age','visits'],ascending=[False,True])

	animal	age	visits	priority
i	dog	7.0	2	no
e	dog	5.0	2	no
g	snake	4.5	1	no
j	dog	3.0	1	no
b	cat	3.0	3	yes
a	cat	2.5	1	yes
f	cat	2.0	3	no
c	snake	0.5	2	no
h	cat	NaN	1	yes
d	dog	NaN	3	yes

#19将priority列的yes和no用True和False替换
#这题似乎有点问题,不同类型的数据类型似乎不能相互转换
df['priority'] = df['priority'].astype(bool).map({'yes': True, 'no': False})
df

	animal	age	visits	priority
a	cat	2.5	1	NaN
b	cat	3.0	3	NaN
c	snake	0.5	2	NaN
d	dog	NaN	3	NaN
e	dog	5.0	2	NaN
f	cat	2.0	3	NaN
g	snake	4.5	1	NaN
h	cat	NaN	1	NaN
i	dog	7.0	2	NaN
j	dog	3.0	1	NaN

#20 将animal列的snake用python替换
df['animal'].replace('snake','python',inplace=True)
df

	animal	age	visits	priority
a	cat	2.5	1	NaN
b	cat	3.0	3	NaN
c	python	0.5	2	NaN
d	dog	NaN	3	NaN
e	dog	5.0	2	NaN
f	cat	2.0	3	NaN
g	python	4.5	1	NaN
h	cat	NaN	1	NaN
i	dog	7.0	2	NaN
j	dog	3.0	1	NaN

#21对于每种动物类型和每种访问次数，求出平均年龄。换句话说，每一行都是动物，每一列都是访问次数，其值是平均年龄(提示：使用数据透视表)
# df.groupby(['animal','visits'])['age'].mean().reset_index()
df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')

visits	1	2	3
animal
cat	2.5	NaN	2.5
dog	3.0	6.0	NaN
python	4.5	0.5	NaN