pandas习题100道

#1
import pandas as pd
#2
pd.__version__
'1.0.5'
#3
pd.show_versions()
INSTALLED VERSIONS
------------------
commit           : None
python           : 3.8.3.final.0
python-bits      : 64
OS               : Darwin
OS-release       : 19.6.0
machine          : x86_64
processor        : i386
byteorder        : little
LC_ALL           : None
LANG             : zh_CN.UTF-8
LOCALE           : zh_CN.UTF-8

pandas           : 1.0.5
numpy            : 1.18.5
pytz             : 2020.1
dateutil         : 2.8.1
pip              : 20.1.1
setuptools       : 49.2.0.post20200714
Cython           : 0.29.21
pytest           : 5.4.3
hypothesis       : None
sphinx           : 3.1.2
blosc            : None
feather          : None
xlsxwriter       : 1.2.9
lxml.etree       : 4.5.2
html5lib         : 1.1
pymysql          : 0.10.1
psycopg2         : None
jinja2           : 2.11.2
IPython          : 7.16.1
pandas_datareader: None
bs4              : 4.9.1
bottleneck       : 1.3.2
fastparquet      : None
gcsfs            : None
lxml.etree       : 4.5.2
matplotlib       : 3.2.2
numexpr          : 2.7.1
odfpy            : None
openpyxl         : 3.0.4
pandas_gbq       : None
pyarrow          : None
pytables         : None
pytest           : 5.4.3
pyxlsb           : None
s3fs             : None
scipy            : 1.5.0
sqlalchemy       : 1.3.18
tables           : 3.6.1
tabulate         : None
xarray           : None
xlrd             : 1.2.0
xlwt             : 1.3.0
xlsxwriter       : 1.2.9
numba            : 0.50.1
#4创建一个DataFrame(df),用data做数据,labels做行索引
import numpy as np
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
      'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
      'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
      'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df = pd.DataFrame(data,index=labels)
df
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
#5显示有关此df及其数据的基本信息的摘要
df.info()
df.describe()
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
age visits
count 8.000000 10.000000
mean 3.437500 1.900000
std 2.007797 0.875595
min 0.500000 1.000000
25% 2.375000 1.000000
50% 3.000000 2.000000
75% 4.625000 2.750000
max 7.000000 3.000000
#6查看此df的前三行数据
df.head(3)
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
#7选择df中列标签为animal和age的数据
df1 = df[['animal','age']]
df1
animal age
a cat 2.5
b cat 3.0
c snake 0.5
d dog NaN
e dog 5.0
f cat 2.0
g snake 4.5
h cat NaN
i dog 7.0
j dog 3.0
#8选择行为[3, 4, 8],且列为['animal', 'age']中的数据
#本题的难点在于当行索引被命名为非数字时不能再使用loc按照数字取值,如果使用iloc就不能用名称取列,而本题的要求是按数字取行,按名称取列
#解法一:分两次取值
df2 = df.iloc[[3,4,8],:][['animal', 'age']]
#解法二:取行索引然后按索引取
df2 = df.loc[df.index[[3,4,8]],['animal', 'age']]
#错误用法:
# df2 = df.loc[[3,4,8],['animal', 'age']]
df2
animal age
d dog NaN
e dog 5.0
i dog 7.0
#9选择visuts大于2的动物种类
df
bool_s=df.visits>2
df[bool_s]['animal']
b    cat
d    dog
f    cat
Name: animal, dtype: object
#10选择age为缺失值的行
df[df.age.isnull()]
df[df['age'].isnull()]
animal age visits priority
d dog NaN 3 yes
h cat NaN 1 yes
#11选择animal为cat,且age小于3的行
df[(df['animal']=='cat') & (df['age']<3)]
animal age visits priority
a cat 2.5 1 yes
f cat 2.0 3 no
#12选择age在2到4之间的数据(包含边界值)
df[(df['age']>=2) &(df['age']<=4)]
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
f cat 2.0 3 no
j dog 3.0 1 no
#13将f行的age改为1.5
# df.loc['f']['age']=1.5 这个操作不规范会警报
df.loc['f','age']=1.5
#14计算visits列的数据总和
res = df['visits'].sum()
res
19
#15计算每种animal的平均age
df.groupby('animal')['age'].mean()
animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64
#16追加一行(k),列的数据自定义,然后再删除新追加的k行

df.loc['k']=df.loc['a'].values
df.drop('k',inplace=True) #删除行
# del df['k'] #删除列
df
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
#17计算每种animal的个数(cat有几个,dog几个...)
df.groupby('animal').size()
df['animal'].value_counts()
# df['animal'].unique() #查看动物的种类
# df['animal'].nunique() #查看动物种类的个数
dog      4
cat      4
snake    2
Name: animal, dtype: int64
#18先根据age降序排列,再根据visits升序排列
df.sort_values(by=['age','visits'],ascending=[False,True])
animal age visits priority
i dog 7.0 2 no
e dog 5.0 2 no
g snake 4.5 1 no
j dog 3.0 1 no
b cat 3.0 3 yes
a cat 2.5 1 yes
f cat 2.0 3 no
c snake 0.5 2 no
h cat NaN 1 yes
d dog NaN 3 yes
#19将priority列的yes和no用True和False替换
#这题似乎有点问题,不同类型的数据类型似乎不能相互转换
df['priority'] = df['priority'].astype(bool).map({'yes': True, 'no': False})
df
animal age visits priority
a cat 2.5 1 NaN
b cat 3.0 3 NaN
c snake 0.5 2 NaN
d dog NaN 3 NaN
e dog 5.0 2 NaN
f cat 2.0 3 NaN
g snake 4.5 1 NaN
h cat NaN 1 NaN
i dog 7.0 2 NaN
j dog 3.0 1 NaN
#20 将animal列的snake用python替换
df['animal'].replace('snake','python',inplace=True)
df
animal age visits priority
a cat 2.5 1 NaN
b cat 3.0 3 NaN
c python 0.5 2 NaN
d dog NaN 3 NaN
e dog 5.0 2 NaN
f cat 2.0 3 NaN
g python 4.5 1 NaN
h cat NaN 1 NaN
i dog 7.0 2 NaN
j dog 3.0 1 NaN
#21对于每种动物类型和每种访问次数,求出平均年龄。换句话说,每一行都是动物,每一列都是访问次数,其值是平均年龄(提示:使用数据透视表)
# df.groupby(['animal','visits'])['age'].mean().reset_index()
df.pivot_table(index='animal', columns='visits', values='age', aggfunc='mean')
visits 1 2 3
animal
cat 2.5 NaN 2.5
dog 3.0 6.0 NaN
python 4.5 0.5 NaN
原文地址:https://www.cnblogs.com/Franciszw/p/14470400.html