2. Machine Learning 的好伙伴 pandas

文件链接和提取码

链接：https://pan.baidu.com/s/1Nwa9N5ah9Otkyrxv9-hFSQ
提取码：go0a

import pandas
"""
读取得到dataFrame结构
"""

# Load the CSV file into a DataFrame.
citi_info = pandas.read_csv('citi.csv')
print(type(citi_info))  # type of the loaded object
print(citi_info.dtypes)  # column names and dtypes; note: object is the dtype shown for string columns
# help() prints the documentation itself and returns None, so wrapping it in
# print() would emit a stray "None" after the help text.
help(pandas.read_csv)

<class 'pandas.core.frame.DataFrame'>
Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume         int64
Adj Close    float64
dtype: object
Help on function read_csv in module pandas.io.parsers:


# Show only the first / last few rows
print(citi_info.head(10))  # display only the first 10 rows
print(citi_info.tail(10))  # display only the last 10 rows

         Date       Open       High        Low      Close   Volume   Adj Close
0  2000-01-03  55.623610  55.623610  51.998701  52.998676  1681900  276.498726
1  2000-01-04  51.998701  52.186196  49.748757  49.748757  2403200  259.543615
2  2000-01-05  50.873729  51.998701  49.498763  51.748707  1742500  269.977529
3  2000-01-06  51.311218  54.686134  51.248720  54.248645  1863200  283.019922
4  2000-01-07  53.998651  54.936128  52.811181  53.998651  1394500  281.715683
5  2000-01-10  54.936128  54.936128  53.561162  53.811156   850300  280.737503
6  2000-01-11  53.373667  54.373642  52.873679  53.123673   996600  277.150845
7  2000-01-12  53.561162  54.998626  53.436165  54.998626  1145700  286.932640
8  2000-01-13  54.998626  56.061099  54.686134  55.623610  1237900  290.193238
9  2000-01-14  56.498589  58.561037  56.436090  57.998551  2225400  302.583511
            Date       Open       High        Low      Close    Volume  
4160  2016-07-18  44.279999  44.900002  44.240002  44.570000  18683900   
4161  2016-07-19  44.189999  44.689999  44.060001  44.349998  15297300   
4162  2016-07-20  44.529999  44.700001  44.200001  44.470001  16547100   
4163  2016-07-21  44.500000  44.700001  44.110001  44.130001  14924900   
4164  2016-07-22  44.099998  44.360001  43.830002  44.299999  12764200   
4165  2016-07-25  44.310001  44.360001  43.910000  44.040001  14391400   
4166  2016-07-26  43.930000  44.240002  43.900002  44.150002  16152500   
4167  2016-07-27  44.200001  44.709999  44.130001  44.290001  17814200   
4168  2016-07-28  44.000000  44.169998  43.680000  44.080002  13239600   
4169  2016-07-29  43.869999  44.160000  43.759998  43.810001  13773700   

      Adj Close  
4160  44.408987  
4161  44.189781  
4162  44.309350  
4163  43.970578  
4164  44.139962  
4165  43.880903  
4166  43.990506  
4167  44.130000  
4168  44.080002  
4169  43.810001  

# Show the column labels; the printed Index also reports its dtype
print(citi_info.columns)

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'], dtype='object')



# Show the size of the data as (rows, columns)
print(citi_info.shape)

(4170, 7)



# Slice the data: select particular rows and columns
print(citi_info.loc[0])  # the single row with index label 0
print(citi_info.loc[1:3])  # rows labelled 1 through 3 (both ends included)
print(citi_info["Date"])  # a single column
columns = ['Date','Open']
print(citi_info[columns])  # several columns selected by a list of names

Date         2000-01-03
Open            55.6236
High            55.6236
Low             51.9987
Close           52.9987
Volume          1681900
Adj Close       276.499
Name: 0, dtype: object
         Date       Open       High        Low      Close   Volume   Adj Close
1  2000-01-04  51.998701  52.186196  49.748757  49.748757  2403200  259.543615
2  2000-01-05  50.873729  51.998701  49.498763  51.748707  1742500  269.977529
3  2000-01-06  51.311218  54.686134  51.248720  54.248645  1863200  283.019922
0       2000-01-03
1       2000-01-04
2       2000-01-05
3       2000-01-06
4       2000-01-07
           ...    
4165    2016-07-25
4166    2016-07-26
4167    2016-07-27
4168    2016-07-28
4169    2016-07-29
Name: Date, Length: 4170, dtype: object
            Date       Open
0     2000-01-03  55.623610
1     2000-01-04  51.998701
2     2000-01-05  50.873729
3     2000-01-06  51.311218
4     2000-01-07  53.998651
...          ...        ...
4165  2016-07-25  44.310001
4166  2016-07-26  43.930000
4167  2016-07-27  44.200001
4168  2016-07-28  44.000000
4169  2016-07-29  43.869999

[4170 rows x 2 columns]



# Collect the columns whose names match a condition (here: names ending in 'e')
print('找出指定的元素的集合')
columns = list(citi_info.columns)
print(columns)
gram_list = [name for name in columns if name.endswith('e')]
print(gram_list)
print(citi_info[gram_list])

找出指定的元素的集合
['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
['Date', 'Close', 'Volume', 'Adj Close']
            Date      Close    Volume   Adj Close
0     2000-01-03  52.998676   1681900  276.498726
1     2000-01-04  49.748757   2403200  259.543615
2     2000-01-05  51.748707   1742500  269.977529
3     2000-01-06  54.248645   1863200  283.019922
4     2000-01-07  53.998651   1394500  281.715683
...          ...        ...       ...         ...
4165  2016-07-25  44.040001  14391400   43.880903
4166  2016-07-26  44.150002  16152500   43.990506
4167  2016-07-27  44.290001  17814200   44.130000
4168  2016-07-28  44.080002  13239600   44.080002
4169  2016-07-29  43.810001  13773700   43.810001

[4170 rows x 4 columns]



# Element-wise arithmetic on whole columns
divided = citi_info['Close']*100  # NOTE: despite the name, this multiplies every Close value by 100
print(divided)
print(citi_info.shape)  # shape before the new column is added
subtraction = citi_info['High'] - citi_info['Low']
citi_info['output'] = subtraction  # assigning to a new label adds a column
print(citi_info.shape)  # shape after the new column is added
print(citi_info['output'])

0       5299.8676
1       4974.8757
2       5174.8707
3       5424.8645
4       5399.8651
          ...    
4165    4404.0001
4166    4415.0002
4167    4429.0001
4168    4408.0002
4169    4381.0001
Name: Close, Length: 4170, dtype: float64
(4170, 7)
(4170, 8)
0       3.624909
1       2.437439
2       2.499938
3       3.437414
4       2.124947
          ...   
4165    0.450001
4166    0.340000
4167    0.579998
4168    0.489998
4169    0.400002
Name: output, Length: 4170, dtype: float64



# Find a column's maximum and scale the column by it
max_date = citi_info['High'].max()  # NOTE: despite the name, this is the maximum of the High column
print(max_date)
normalize_date = citi_info['High']/max_date  # every High value divided by that maximum
citi_info['High_normalize_date'] = normalize_date
print(citi_info['High_normalize_date'])

78.310544
0       0.710295
1       0.666401
2       0.664006
3       0.698324
4       0.701516
          ...   
4165    0.566463
4166    0.564930
4167    0.570932
4168    0.564036
4169    0.563909
Name: High_normalize_date, Length: 4170, dtype: float64



# Sort the data
citi_info.sort_values('Close',inplace=True) # ascending; inplace=True reorders citi_info itself instead of producing a new DataFrame
print(citi_info['Close'])
citi_info.sort_values('Close',inplace=True,ascending=False) # descending
print(citi_info['Close'])

#

2305     1.020000
2306     1.030000
2307     1.050000
2304     1.130000
2302     1.200000
          ...    
162     75.373117
164     75.935603
163     76.748083
160     76.873080
161     77.310569
Name: Close, Length: 4170, dtype: float64
161     77.310569
160     76.873080
163     76.748083
164     75.935603
162     75.373117
          ...    
2302     1.200000
2304     1.130000
2307     1.050000
2306     1.030000
2305     1.020000
Name: Close, Length: 4170, dtype: float64

pandas调用函数扩展

import numpy as np
import pandas as pd
titanic_survival = pd.read_csv('titanic_train.csv')
print(titanic_survival.head())

age = titanic_survival["Age"]
print(age)
age_is_null = pd.isnull(age) # boolean mask: True where Age is missing
print(age_is_null)
age_null_true = age[age_is_null]  # keep only the entries whose Age is missing, to print them
print(age_null_true)
age_null_count = len(age_null_true)  # number of missing ages
print(age_null_count)

   PassengerId  Survived  Pclass  
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64
0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
       ..
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: Age, Length: 177, dtype: float64
177


# Incorrect way to compute the mean age: the missing (NaN) ages are not
# removed first, and a sum that includes NaN is itself NaN
mean_age = sum(titanic_survival['Age']/len(titanic_survival["Age"]))
print(mean_age)

nan


# Remove the missing values before averaging.
# (Missing values could alternatively be filled with the mean or the median.)
good_age = titanic_survival.loc[~age_is_null, "Age"]
correct_mean_age = sum(good_age) / len(good_age)
# Series.mean() gives the same figure without any manual filtering
correct_mean_age2 = titanic_survival["Age"].mean()
print(correct_mean_age, correct_mean_age2)

29.69911764705882 29.69911764705882


# Average ticket fare for each passenger class
passenger_class = [1, 2, 3]
fares_by_class = {
    this_class: titanic_survival.loc[titanic_survival['Pclass'] == this_class, "Fare"].mean()
    for this_class in passenger_class
}
print(fares_by_class)

{1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}


# Quick summaries of how columns relate to each other, using pivot tables.

# Survival rate per passenger class: Survived holds 0/1, so its mean is the
# fraction of passengers in each class who survived.
# Aggregations are named by string ("mean", "sum") because recent pandas
# releases emit a FutureWarning when given NumPy callables such as np.mean.
passenger_survival = titanic_survival.pivot_table(index='Pclass', values='Survived', aggfunc='mean')
print(passenger_survival)

# Mean age per passenger class ("mean" is also pivot_table's default aggfunc)
passenger_age = titanic_survival.pivot_table(index='Pclass', values="Age", aggfunc='mean')
print(passenger_age)

# Total fare collected and number of survivors per port of embarkation
port_stats = titanic_survival.pivot_table(index='Embarked', values=['Fare', 'Survived'], aggfunc='sum')
print(port_stats)

        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363
              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620
                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217


# Handling missing values
drop_na_columns = titanic_survival.dropna(axis=1) # axis=1 drops every COLUMN that contains a missing value (not rows)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=['Age',"Sex"]) # axis=0 drops rows; only the columns listed in subset are checked for missing values
print(new_titanic_survival)

     PassengerId  Survived  Pclass  
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
885          886         0       3   
886          887         0       2   
887          888         1       1   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ...    ...   
885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
886                              Montvila, Rev. Juozas    male  27.0      0   
887                       Graham, Miss. Margaret Edith  female  19.0      0   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch            Ticket     Fare Cabin Embarked  
0        0         A/5 21171   7.2500   NaN        S  
1        0          PC 17599  71.2833   C85        C  
2        0  STON/O2. 3101282   7.9250   NaN        S  
3        0            113803  53.1000  C123        S  
4        0            373450   8.0500   NaN        S  
..     ...               ...      ...   ...      ...  
885      5            382652  29.1250   NaN        Q  
886      0            211536  13.0000   NaN        S  
887      0            112053  30.0000   B42        S  
889      0            111369  30.0000  C148        C  
890      0            370376   7.7500   NaN        Q  

[714 rows x 12 columns]


# Look up a single value by row label and column name
passenger_83_age = titanic_survival.loc[83,'Age']
print('83号乘客的年龄：',passenger_83_age)

83号乘客的年龄： 28.0


# Sort the rows by Age (descending), then renumber the index from 0
new_titanic_survival = titanic_survival.sort_values('Age',ascending=False)
titanic_reinex = new_titanic_survival.reset_index(drop=True)  # drop=True discards the old index values
print(titanic_reinex)

     PassengerId  Survived  Pclass                                      Name  
0            631         1       1      Barkworth, Mr. Algernon Henry Wilson   
1            852         0       3                       Svensson, Mr. Johan   
2            494         0       1                   Artagaveytia, Mr. Ramon   
3             97         0       1                 Goldschmidt, Mr. George B   
4            117         0       3                      Connors, Mr. Patrick   
..           ...       ...     ...                                       ...   
886          860         0       3                          Razi, Mr. Raihed   
887          864         0       3         Sage, Miss. Dorothy Edith "Dolly"   
888          869         0       3               van Melkebeke, Mr. Philemon   
889          879         0       3                        Laleff, Mr. Kristo   
890          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   

        Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0      male  80.0      0      0       27042  30.0000   A23        S  
1      male  74.0      0      0      347060   7.7750   NaN        S  
2      male  71.0      0      0    PC 17609  49.5042   NaN        C  
3      male  71.0      0      0    PC 17754  34.6542    A5        C  
4      male  70.5      0      0      370369   7.7500   NaN        Q  
..      ...   ...    ...    ...         ...      ...   ...      ...  
886    male   NaN      0      0        2629   7.2292   NaN        C  
887  female   NaN      8      2    CA. 2343  69.5500   NaN        S  
888    male   NaN      0      0      345777   9.5000   NaN        S  
889    male   NaN      0      0      349217   7.8958   NaN        S  
890  female   NaN      1      2  W./C. 6607  23.4500   NaN        S  

[891 rows x 12 columns]



# pandas自定义函数
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    With a default 0-based index this is the hundredth item of the column.
    """
    return column.loc[99]
# Apply the function to every column of the DataFrame. The result is stored
# under its own name so that the hundredth_row function is not overwritten
# and this snippet can be executed again.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)

PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object

pandas 之 Series

from pandas import Series
import pandas as pd

"""
每一列为serie结构
"""

fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango["FILM"]  # selecting a single column yields a Series (confirmed by the printed type)
print(type(series_film))

<class 'pandas.core.series.Series'>


# Slice a Series
print(series_film[0:5])
series_rt = fandango["RottenTomatoes"]

film_names = series_film.values
print(type(film_names))  # .values is a numpy ndarray (pandas is built on top of numpy)
rt_scores = series_rt.values

0    Avengers: Age of Ultron (2015)
1                 Cinderella (2015)
2                    Ant-Man (2015)
3            Do You Believe? (2015)
4     Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
<class 'numpy.ndarray'>


# Build a Series of scores indexed by film name, then look up specific films
series_custom = Series(rt_scores,index=film_names)
print(series_custom[['Cinderella (2015)','Ant-Man (2015)']])

Cinderella (2015)    85
Ant-Man (2015)       80
dtype: int64


# Positional slice: the entries at positions 5 through 9 (the end, 10, is excluded)
fiveten = series_custom[5:10]
print(fiveten)

The Water Diviner (2015)        63
Irrational Man (2015)           42
Top Five (2014)                 86
Shaun the Sheep Movie (2015)    99
Love & Mercy (2015)             89
dtype: int64



# Use the FILM column as the index; drop=False keeps FILM as a regular column as well
set_film_index = fandango.set_index("FILM", drop=False)