pandas 实战笔记

pd.show_versions()
pd.__version__
pd.read_csv?           --help docu
pd.set_option?           --help docu
读取文件:
       oo=pd.read_csv('olympics.csv',skiprows=4)
       oo = pd.read_table('Z:/test.txt',header=None,encoding='gb2312',delim_whitespace=True,index_col=0)

       def convert_percent(val):
           """Turn a percentage string such as ``"30%"`` into a decimal fraction.

           Every ``%`` character is dropped from ``val``, the remaining text is
           parsed as a float, and that number is divided by 100.
           """
           # Splitting on "%" and re-joining discards every percent sign.
           digits = "".join(val.split('%'))
           fraction = float(digits) / 100
           return fraction

       df_2 = pd.read_csv("sales_data_types.csv",dtype={"Customer_Number":"int"},converters={
           "2016":convert_currency,
           "2017":convert_currency,
           "Percent Growth":convert_percent,
           "Jan Units":lambda x:pd.to_numeric(x,errors="coerce"),
           "Active":lambda x: np.where(x=="Y",True,False)})

输出
       oo.to_csv('finally.csv',encoding='gb2312')   #输出为csv文件(可用Excel打开)


显示列
   #显示所有列
   pd.set_option('display.max_columns', None)
   oo.columns.values                               #显示所有列名
   print([column for column in oo])               #显示所有列名

   oo.columns=oo.columns.str.strip()               #除列名中的空格
   oo = oo.rename(columns=lambda x: x.replace(" ","").replace("      ","").replace(" ","").replace(" ",""))   #除列名中的空格以及特殊字符

   oo['City']
   oo.City           --列名中没有空格时才能用属性方式访问(区分大小写)
   type(oo[['City','Sport','Event']])
   oo[['City','Sport','Event']].tail()
   oo.TS_NAME.str.len()                       #列长度
   oo.TS_NAME.str.strip()                       #去除某列字符串两端的空格 lstrip()表示去除左边空格 rstrip()表示去除右边空格 strip()表示去除左右两边，当中的空格仍在：
   oo['Sales'].str.replace(' ','')               #用replace 可以替换所有的空格：


显示行
   #设置value的显示长度为100，默认为50
   pd.set_option('max_colwidth',100)
   #显示所有行
   pd.set_option('display.max_rows', None)
   type(oo.iloc)
   type(oo.iloc[2])   显示第三行
   oo.drop_duplicates(['name'])           #以name删除重复行

总体信息显示
   oo.shape
   oo.shape[0]
   oo.shape[1]
   oo.head(10)
   oo.tail(10)
   oo.info() --数据的描述显示字段,数据类型, 使用内存大小,行数
   oo.dtypes
   type(oo.Edition)

删除行,列
   drop方法的用法：drop(labels, axis=0, level=None, inplace=False, errors='raise')
        -- axis为0时表示删除行，axis为1时表示删除列
       --   drop默认对原表不生效，如果要对原表生效，需要加参数：inplace=True
       -- 通过labels来控制删除行或列的个数，如果是删多行/多列，需写成labels=[1,3]，不能写成labels=[1:2],用:号会报错
       df2=df1.drop(labels=0)   # axis默认等于0，即按行删除，这里表示按行删除第0行
       df2=df1.drop(labels=[1,3],axis=0)   # axis=0 表示按行删除，删除第1行和第3行
       df2=df1.drop(labels=range(1,4),axis=0)   # axis=0 表示按行删除，删除索引值是第1行至第3行的整行数据
       df3=df1.drop(labels='gender',axis=1) # axis=1 表示按列删除，删除gender列
       df4=df1.drop(labels=['gender',"age"],axis=1) # axis=1 表示按列删除，删除gender、age列
格式转换
       .astype("int")
       .astype('float')
       .astype('bool')
       .apply(lambda x: x.replace(",","").replace("$","")).astype("float64")

       pd.to_numeric(df["Jan Units"],errors='coerce').fillna(0)           # pandas中pd.to_numeric()处理Jan Units中的数据
       pd.to_datetime(df[['Month', 'Day', 'Year']])                       # 最后利用pd.to_datetime()将年月日进行合并

       oo[['TS_SIZE_MB','TS_USED_MB','TS_FREE_MB','PCT_USED']]=oo[['TS_SIZE_MB','TS_USED_MB','TS_FREE_MB','PCT_USED']].astype('float')

       def convert_currency(var):
           """Parse a currency-formatted string into a float.

           Thousands separators (``,``) and dollar signs (``$``) are removed
           from ``var`` before the remaining text is handed to ``float``.
           """
           cleaned = var
           # Strip each formatting symbol in turn.
           for symbol in (",", "$"):
               cleaned = cleaned.replace(symbol, "")
           return float(cleaned)