pandas tutorial 2

@

import pandas as pd
import numpy as np

Group_By

ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
   'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
   'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
   'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
   'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)
df
Team Rank Year Points
0 Riders 1 2014 876
1 Riders 2 2015 789
2 Devils 2 2014 863
3 Devils 3 2015 673
4 Kings 3 2014 741
5 kings 4 2015 812
6 Kings 1 2016 756
7 Kings 1 2017 788
8 Riders 2 2016 694
9 Royals 4 2014 701
10 Royals 1 2015 804
11 Riders 2 2017 690

对数据进行分组

有以下几种方式:

.groupby('key')

.groupby(['key1', 'key2'])

.groupby(key, axis=1)

df.groupby('Team')
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x00000162D1D4A940>
df.groupby('Team').groups  #可视化
{'Devils': Int64Index([2, 3], dtype='int64'),
 'Kings': Int64Index([4, 6, 7], dtype='int64'),
 'Riders': Int64Index([0, 1, 8, 11], dtype='int64'),
 'Royals': Int64Index([9, 10], dtype='int64'),
 'kings': Int64Index([5], dtype='int64')}
df.groupby(['Team', 'Year']).groups
{('Devils', 2014): Int64Index([2], dtype='int64'),
 ('Devils', 2015): Int64Index([3], dtype='int64'),
 ('Kings', 2014): Int64Index([4], dtype='int64'),
 ('Kings', 2016): Int64Index([6], dtype='int64'),
 ('Kings', 2017): Int64Index([7], dtype='int64'),
 ('Riders', 2014): Int64Index([0], dtype='int64'),
 ('Riders', 2015): Int64Index([1], dtype='int64'),
 ('Riders', 2016): Int64Index([8], dtype='int64'),
 ('Riders', 2017): Int64Index([11], dtype='int64'),
 ('Royals', 2014): Int64Index([9], dtype='int64'),
 ('Royals', 2015): Int64Index([10], dtype='int64'),
 ('kings', 2015): Int64Index([5], dtype='int64')}

对 group进行迭代

grouped = df.groupby('Year')
for name, group in grouped:
    print(name)
    print(group)
2014
     Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701
2015
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    kings     4  2015     812
10  Royals     1  2015     804
2016
     Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694
2017
      Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690

选择一个group get_group()

grouped.get_group(2014)
Team Rank Year Points
0 Riders 1 2014 876
2 Devils 2 2014 863
4 Kings 3 2014 741
9 Royals 4 2014 701

Aggregations 在group的基础上传入函数整合

grouped['Points']
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000162D2DCF048>
grouped['Points'].agg(np.mean)
Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64
grouped.agg(np.mean)
Rank Points
Year
2014 2.5 795.25
2015 2.5 769.50
2016 1.5 725.00
2017 1.5 739.00
grouped = df.groupby('Team')
grouped.agg(np.size)
Rank Year Points
Team
Devils 2 2 2
Kings 3 3 3
Riders 4 4 4
Royals 2 2 2
kings 1 1 1
grouped.agg(len)
Rank Year Points
Team
Devils 2 2 2
Kings 3 3 3
Riders 4 4 4
Royals 2 2 2
kings 1 1 1
grouped.agg([np.sum, np.mean])
Rank Year Points
sum mean sum mean sum mean
Team
Devils 5 2.500000 4029 2014.500000 1536 768.000000
Kings 5 1.666667 6047 2015.666667 2285 761.666667
Riders 7 1.750000 8062 2015.500000 3049 762.250000
Royals 5 2.500000 4029 2014.500000 1505 752.500000
kings 4 4.000000 2015 2015.000000 812 812.000000

Transformations 在group的基础上传入函数变换

score = lambda x: (x - x.mean()) / x.std()*10
grouped.transform(score)
Rank Year Points
0 -15.000000 -11.618950 12.843272
1 5.000000 -3.872983 3.020286
2 -7.071068 -7.071068 7.071068
3 7.071068 7.071068 -7.071068
4 11.547005 -10.910895 -8.608621
5 NaN NaN NaN
6 -5.773503 2.182179 -2.360428
7 -5.773503 8.728716 10.969049
8 5.000000 3.872983 -7.705963
9 7.071068 -7.071068 -7.071068
10 -7.071068 7.071068 7.071068
11 5.000000 11.618950 -8.157595
def prin(x):
    print(x)
    print("*******")
    return 1
grouped.transform(prin)
2    2
3    3
Name: Rank, dtype: int64
*******
2    2014
3    2015
Name: Year, dtype: int64
*******
2    863
3    673
Name: Points, dtype: int64
*******
   Rank  Year  Points
2     2  2014     863
3     3  2015     673
*******
4    3
6    1
7    1
Name: Rank, dtype: int64
*******
4    2014
6    2016
7    2017
Name: Year, dtype: int64
*******
4    741
6    756
7    788
Name: Points, dtype: int64
*******
0     1
1     2
8     2
11    2
Name: Rank, dtype: int64
*******
0     2014
1     2015
8     2016
11    2017
Name: Year, dtype: int64
*******
0     876
1     789
8     694
11    690
Name: Points, dtype: int64
*******
9     4
10    1
Name: Rank, dtype: int64
*******
9     2014
10    2015
Name: Year, dtype: int64
*******
9     701
10    804
Name: Points, dtype: int64
*******
5    4
Name: Rank, dtype: int64
*******
5    2015
Name: Year, dtype: int64
*******
5    812
Name: Points, dtype: int64
*******
Rank Year Points
0 1 1 1
1 1 1 1
2 1 1 1
3 1 1 1
4 1 1 1
5 1 1 1
6 1 1 1
7 1 1 1
8 1 1 1
9 1 1 1
10 1 1 1
11 1 1 1

过滤

df.groupby('Team').filter(lambda x: len(x) >= 3)
Team Rank Year Points
0 Riders 1 2014 876
1 Riders 2 2015 789
4 Kings 3 2014 741
6 Kings 1 2016 756
7 Kings 1 2017 788
8 Riders 2 2016 694
11 Riders 2 2017 690
def prin(x):
    print(x)
    print("*******")
    if len(x) >=3:
        return True
    else:
        return False
df.groupby('Team').filter(prin)
     Team  Rank  Year  Points
2  Devils     2  2014     863
3  Devils     3  2015     673
*******
    Team  Rank  Year  Points
4  Kings     3  2014     741
6  Kings     1  2016     756
7  Kings     1  2017     788
*******
      Team  Rank  Year  Points
0   Riders     1  2014     876
1   Riders     2  2015     789
8   Riders     2  2016     694
11  Riders     2  2017     690
*******
      Team  Rank  Year  Points
9   Royals     4  2014     701
10  Royals     1  2015     804
*******
    Team  Rank  Year  Points
5  kings     4  2015     812
*******
Team Rank Year Points
0 Riders 1 2014 876
1 Riders 2 2015 789
4 Kings 3 2014 741
6 Kings 1 2016 756
7 Kings 1 2017 788
8 Riders 2 2016 694
11 Riders 2 2017 690

Merging/Joining

left = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame(
   {'id':[1,2,3,4,6],
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5']})
left
id Name subject_id
0 1 Alex sub1
1 2 Amy sub2
2 3 Allen sub4
3 4 Alice sub6
4 5 Ayoung sub5
right
id Name subject_id
0 1 Billy sub2
1 2 Brian sub4
2 3 Bran sub3
3 4 Bryce sub6
4 6 Betty sub5
help(pd.merge)
Help on function merge in module pandas.core.reshape.merge:

merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
    Merge DataFrame objects by performing a database-style join operation by
    columns or indexes.
    
    If joining columns on columns, the DataFrame indexes *will be
    ignored*. Otherwise if joining indexes on indexes or indexes on a column or
    columns, the index will be passed on.
    
    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        * left: use only keys from left frame, similar to a SQL left outer join;
          preserve key order
        * right: use only keys from right frame, similar to a SQL right outer join;
          preserve key order
        * outer: use union of keys from both frames, similar to a SQL full outer
          join; sort keys lexicographically
        * inner: use intersection of keys from both frames, similar to a SQL inner
          join; preserve the order of the left keys
    on : label or list
        Column or index level names to join on. These must be found in both
        DataFrames. If `on` is None and not merging on indexes then this defaults
        to the intersection of the columns in both DataFrames.
    left_on : label or list, or array-like
        Column or index level names to join on in the left DataFrame. Can also
        be an array or list of arrays of the length of the left DataFrame.
        These arrays are treated as if they are columns.
    right_on : label or list, or array-like
        Column or index level names to join on in the right DataFrame. Can also
        be an array or list of arrays of the length of the right DataFrame.
        These arrays are treated as if they are columns.
    left_index : boolean, default False
        Use the index from the left DataFrame as the join key(s). If it is a
        MultiIndex, the number of keys in the other DataFrame (either the index
        or a number of columns) must match the number of levels
    right_index : boolean, default False
        Use the index from the right DataFrame as the join key. Same caveats as
        left_index
    sort : boolean, default False
        Sort the join keys lexicographically in the result DataFrame. If False,
        the order of the join keys depends on the join type (how keyword)
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively
    copy : boolean, default True
        If False, do not copy data unnecessarily
    indicator : boolean or string, default False
        If True, adds a column to output DataFrame called "_merge" with
        information on the source of each row.
        If string, column with information on source of each row will be added to
        output DataFrame, and column will be named value of string.
        Information column is Categorical-type and takes on a value of "left_only"
        for observations whose merge key only appears in 'left' DataFrame,
        "right_only" for observations whose merge key only appears in 'right'
        DataFrame, and "both" if the observation's merge key is found in both.
    
    validate : string, default None
        If specified, checks if merge is of specified type.
    
        * "one_to_one" or "1:1": check if merge keys are unique in both
          left and right datasets.
        * "one_to_many" or "1:m": check if merge keys are unique in left
          dataset.
        * "many_to_one" or "m:1": check if merge keys are unique in right
          dataset.
        * "many_to_many" or "m:m": allowed, but does not result in checks.
    
        .. versionadded:: 0.21.0
    
    Notes
    -----
    Support for specifying index levels as the `on`, `left_on`, and
    `right_on` parameters was added in version 0.23.0
    
    Examples
    --------
    
    >>> A              >>> B
        lkey value         rkey value
    0   foo  1         0   foo  5
    1   bar  2         1   bar  6
    2   baz  3         2   qux  7
    3   foo  4         3   bar  8
    
    >>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
       lkey  value_x  rkey  value_y
    0  foo   1        foo   5
    1  foo   4        foo   5
    2  bar   2        bar   6
    3  bar   2        bar   8
    4  baz   3        NaN   NaN
    5  NaN   NaN      qux   7
    
    Returns
    -------
    merged : DataFrame
        The output type will the be same as 'left', if it is a subclass
        of DataFrame.
    
    See also
    --------
    merge_ordered
    merge_asof
    DataFrame.join
pd.merge(left, right, on='id')
id Name_x subject_id_x Name_y subject_id_y
0 1 Alex sub1 Billy sub2
1 2 Amy sub2 Brian sub4
2 3 Allen sub4 Bran sub3
3 4 Alice sub6 Bryce sub6
pd.merge(left, right, on=['id', 'subject_id'])
id Name_x subject_id Name_y
0 4 Alice sub6 Bryce

利用how

left = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5']})
pd.merge(left, right, on="subject_id", how="left") #按照左边数据框的index
id_x Name_x subject_id id_y Name_y
0 1 Alex sub1 NaN NaN
1 2 Amy sub2 1.0 Billy
2 3 Allen sub4 2.0 Brian
3 4 Alice sub6 4.0 Bryce
4 5 Ayoung sub5 5.0 Betty
pd.merge(left, right, on="subject_id", how="right")  #按照右边数据框的index
id_x Name_x subject_id id_y Name_y
0 2.0 Amy sub2 1 Billy
1 3.0 Allen sub4 2 Brian
2 4.0 Alice sub6 4 Bryce
3 5.0 Ayoung sub5 5 Betty
4 NaN NaN sub3 3 Bran
pd.merge(left, right, on="subject_id", how="outer")  #左右数据框的并集
id_x Name_x subject_id id_y Name_y
0 1.0 Alex sub1 NaN NaN
1 2.0 Amy sub2 1.0 Billy
2 3.0 Allen sub4 2.0 Brian
3 4.0 Alice sub6 4.0 Bryce
4 5.0 Ayoung sub5 5.0 Betty
5 NaN NaN sub3 3.0 Bran
pd.merge(left, right, on="subject_id", how="inner") #默认  交集
id_x Name_x subject_id id_y Name_y
0 2 Amy sub2 1 Billy
1 3 Allen sub4 2 Brian
2 4 Alice sub6 4 Bryce
3 5 Ayoung sub5 5 Betty

concatenation

pd.concat()

help(pd.concat)
Help on function concat in module pandas.core.reshape.concat:

concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=None, copy=True)
    Concatenate pandas objects along a particular axis with optional set logic
    along the other axes.
    
    Can also add a layer of hierarchical indexing on the concatenation axis,
    which may be useful if the labels are the same (or overlapping) on
    the passed axis number.
    
    Parameters
    ----------
    objs : a sequence or mapping of Series, DataFrame, or Panel objects
        If a dict is passed, the sorted keys will be used as the `keys`
        argument, unless it is passed, in which case the values will be
        selected (see below). Any None objects will be dropped silently unless
        they are all None in which case a ValueError will be raised
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along
    join : {'inner', 'outer'}, default 'outer'
        How to handle indexes on other axis(es)
    join_axes : list of Index objects
        Specific indexes to use for the other n - 1 axes instead of performing
        inner/outer set logic
    ignore_index : boolean, default False
        If True, do not use the index values along the concatenation axis. The
        resulting axis will be labeled 0, ..., n - 1. This is useful if you are
        concatenating objects where the concatenation axis does not have
        meaningful indexing information. Note the index values on the other
        axes are still respected in the join.
    keys : sequence, default None
        If multiple levels passed, should contain tuples. Construct
        hierarchical index using the passed keys as the outermost level
    levels : list of sequences, default None
        Specific levels (unique values) to use for constructing a
        MultiIndex. Otherwise they will be inferred from the keys
    names : list, default None
        Names for the levels in the resulting hierarchical index
    verify_integrity : boolean, default False
        Check whether the new concatenated axis contains duplicates. This can
        be very expensive relative to the actual data concatenation
    sort : boolean, default None
        Sort non-concatenation axis if it is not already aligned when `join`
        is 'outer'. The current default of sorting is deprecated and will
        change to not-sorting in a future version of pandas.
    
        Explicitly pass ``sort=True`` to silence the warning and sort.
        Explicitly pass ``sort=False`` to silence the warning and not sort.
    
        This has no effect when ``join='inner'``, which already preserves
        the order of the non-concatenation axis.
    
        .. versionadded:: 0.23.0
    
    copy : boolean, default True
        If False, do not copy data unnecessarily
    
    Returns
    -------
    concatenated : object, type of objs
        When concatenating all ``Series`` along the index (axis=0), a
        ``Series`` is returned. When ``objs`` contains at least one
        ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
        the columns (axis=1), a ``DataFrame`` is returned.
    
    Notes
    -----
    The keys, levels, and names arguments are all optional.
    
    A walkthrough of how this method fits in with other tools for combining
    pandas objects can be found `here
    <http://pandas.pydata.org/pandas-docs/stable/merging.html>`__.
    
    See Also
    --------
    Series.append
    DataFrame.append
    DataFrame.join
    DataFrame.merge
    
    Examples
    --------
    Combine two ``Series``.
    
    >>> s1 = pd.Series(['a', 'b'])
    >>> s2 = pd.Series(['c', 'd'])
    >>> pd.concat([s1, s2])
    0    a
    1    b
    0    c
    1    d
    dtype: object
    
    Clear the existing index and reset it in the result
    by setting the ``ignore_index`` option to ``True``.
    
    >>> pd.concat([s1, s2], ignore_index=True)
    0    a
    1    b
    2    c
    3    d
    dtype: object
    
    Add a hierarchical index at the outermost level of
    the data with the ``keys`` option.
    
    >>> pd.concat([s1, s2], keys=['s1', 's2',])
    s1  0    a
        1    b
    s2  0    c
        1    d
    dtype: object
    
    Label the index keys you create with the ``names`` option.
    
    >>> pd.concat([s1, s2], keys=['s1', 's2'],
    ...           names=['Series name', 'Row ID'])
    Series name  Row ID
    s1           0         a
                 1         b
    s2           0         c
                 1         d
    dtype: object
    
    Combine two ``DataFrame`` objects with identical columns.
    
    >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
    ...                    columns=['letter', 'number'])
    >>> df1
      letter  number
    0      a       1
    1      b       2
    >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
    ...                    columns=['letter', 'number'])
    >>> df2
      letter  number
    0      c       3
    1      d       4
    >>> pd.concat([df1, df2])
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4
    
    Combine ``DataFrame`` objects with overlapping columns
    and return everything. Columns outside the intersection will
    be filled with ``NaN`` values.
    
    >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
    ...                    columns=['letter', 'number', 'animal'])
    >>> df3
      letter  number animal
    0      c       3    cat
    1      d       4    dog
    >>> pd.concat([df1, df3])
      animal letter  number
    0    NaN      a       1
    1    NaN      b       2
    0    cat      c       3
    1    dog      d       4
    
    Combine ``DataFrame`` objects with overlapping columns
    and return only those that are shared by passing ``inner`` to
    the ``join`` keyword argument.
    
    >>> pd.concat([df1, df3], join="inner")
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4
    
    Combine ``DataFrame`` objects horizontally along the x axis by
    passing in ``axis=1``.
    
    >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
    ...                    columns=['animal', 'name'])
    >>> pd.concat([df1, df4], axis=1)
      letter  number  animal    name
    0      a       1    bird   polly
    1      b       2  monkey  george
    
    Prevent the result from including duplicate index values with the
    ``verify_integrity`` option.
    
    >>> df5 = pd.DataFrame([1], index=['a'])
    >>> df5
       0
    a  1
    >>> df6 = pd.DataFrame([2], index=['a'])
    >>> df6
       0
    a  2
    >>> pd.concat([df5, df6], verify_integrity=True)
    Traceback (most recent call last):
        ...
    ValueError: Indexes have overlapping values: ['a']
one = pd.DataFrame({
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5'],
   'Marks_scored':[98,90,87,69,78]},
   index=[1,2,3,4,5])

two = pd.DataFrame({
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5'],
   'Marks_scored':[89,80,79,97,88]},
   index=[1,2,3,4,5])
one
Name subject_id Marks_scored
1 Alex sub1 98
2 Amy sub2 90
3 Allen sub4 87
4 Alice sub6 69
5 Ayoung sub5 78
two
Name subject_id Marks_scored
1 Billy sub2 89
2 Brian sub4 80
3 Bran sub3 79
4 Bryce sub6 97
5 Betty sub5 88
df = pd.concat([one, two])
df
Name subject_id Marks_scored
1 Alex sub1 98
2 Amy sub2 90
3 Allen sub4 87
4 Alice sub6 69
5 Ayoung sub5 78
1 Billy sub2 89
2 Brian sub4 80
3 Bran sub3 79
4 Bryce sub6 97
5 Betty sub5 88

注意到上面的index是重复的

df.iloc[0]
Name            Alex
subject_id      sub1
Marks_scored      98
Name: 1, dtype: object
df.loc[1]
Name subject_id Marks_scored
1 Alex sub1 98
1 Billy sub2 89
df = pd.concat([one, two], keys=['one', 'two'])
df
Name subject_id Marks_scored
one 1 Alex sub1 98
2 Amy sub2 90
3 Allen sub4 87
4 Alice sub6 69
5 Ayoung sub5 78
two 1 Billy sub2 89
2 Brian sub4 80
3 Bran sub3 79
4 Bryce sub6 97
5 Betty sub5 88
df.iloc[1]
Name             Amy
subject_id      sub2
Marks_scored      90
Name: (one, 2), dtype: object
df.loc[('one', 2)]
Name             Amy
subject_id      sub2
Marks_scored      90
Name: (one, 2), dtype: object

想要让index不重复,可以利用ignore_index

pd.concat([one, two], keys=['x', 'y'], ignore_index=True)
Name subject_id Marks_scored
0 Alex sub1 98
1 Amy sub2 90
2 Allen sub4 87
3 Alice sub6 69
4 Ayoung sub5 78
5 Billy sub2 89
6 Brian sub4 80
7 Bran sub3 79
8 Bryce sub6 97
9 Betty sub5 88

此时keys也被覆写了

.append()

one.append(two)
Name subject_id Marks_scored
1 Alex sub1 98
2 Amy sub2 90
3 Allen sub4 87
4 Alice sub6 69
5 Ayoung sub5 78
1 Billy sub2 89
2 Brian sub4 80
3 Bran sub3 79
4 Bryce sub6 97
5 Betty sub5 88
one.append([two, one, two])
Name subject_id Marks_scored
1 Alex sub1 98
2 Amy sub2 90
3 Allen sub4 87
4 Alice sub6 69
5 Ayoung sub5 78
1 Billy sub2 89
2 Brian sub4 80
3 Bran sub3 79
4 Bryce sub6 97
5 Betty sub5 88
1 Alex sub1 98
2 Amy sub2 90
3 Allen sub4 87
4 Alice sub6 69
5 Ayoung sub5 78
1 Billy sub2 89
2 Brian sub4 80
3 Bran sub3 79
4 Bryce sub6 97
5 Betty sub5 88

时间序列

datetime.now()

pd.datetime.now()
datetime.datetime(2019, 9, 5, 21, 9, 55, 821684)
pd.Timestamp(1587687255, unit='s')
Timestamp('2020-04-24 00:14:15')

创建时间序列

pd.date_range("11:00", "13:30", freq="30min").time
array([datetime.time(11, 0), datetime.time(11, 30), datetime.time(12, 0),
       datetime.time(12, 30), datetime.time(13, 0), datetime.time(13, 30)],
      dtype=object)
pd.date_range("11:00", "13:30", freq="h").time
array([datetime.time(11, 0), datetime.time(12, 0), datetime.time(13, 0)],
      dtype=object)
pd.to_datetime(pd.Series(['Jul 31, 2009','2010-01-10', None]))
0   2009-07-31
1   2010-01-10
2          NaT
dtype: datetime64[ns]
pd.to_datetime(['2005/11/23 00:14:15', '2010.12.31', None])
DatetimeIndex(['2005-11-23 00:14:15', '2010-12-31 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

period freq

pd.date_range('1/1/2011', periods=5)
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
               '2011-01-05'],
              dtype='datetime64[ns]', freq='D')
help(pd.date_range)
Help on function date_range in module pandas.core.indexes.datetimes:

date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
    Return a fixed frequency DatetimeIndex.
    
    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : integer, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D' (calendar daily)
        Frequency strings can have multiples, e.g. '5H'. See
        :ref:`here <timeseries.offset_aliases>` for a list of
        frequency aliases.
    tz : str or tzinfo, optional
        Time zone name for returning localized DatetimeIndex, for example
        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
        timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight before generating date range.
    name : str, default None
        Name of the resulting DatetimeIndex.
    closed : {None, 'left', 'right'}, optional
        Make the interval closed with respect to the given frequency to
        the 'left', 'right', or both sides (None, the default).
    **kwargs
        For compatibility. Has no effect on the result.
    
    Returns
    -------
    rng : DatetimeIndex
    
    See Also
    --------
    pandas.DatetimeIndex : An immutable container for datetimes.
    pandas.timedelta_range : Return a fixed frequency TimedeltaIndex.
    pandas.period_range : Return a fixed frequency PeriodIndex.
    pandas.interval_range : Return a fixed frequency IntervalIndex.
    
    Notes
    -----
    Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
    exactly three must be specified. If ``freq`` is omitted, the resulting
    ``DatetimeIndex`` will have ``periods`` linearly spaced elements between
    ``start`` and ``end`` (closed on both sides).
    
    To learn more about the frequency strings, please see `this link
    <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`__.
    
    Examples
    --------
    **Specifying the values**
    
    The next four examples generate the same `DatetimeIndex`, but vary
    the combination of `start`, `end` and `periods`.
    
    Specify `start` and `end`, with the default daily frequency.
    
    >>> pd.date_range(start='1/1/2018', end='1/08/2018')
    DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
                   '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
                  dtype='datetime64[ns]', freq='D')
    
    Specify `start` and `periods`, the number of periods (days).
    
    >>> pd.date_range(start='1/1/2018', periods=8)
    DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
                   '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
                  dtype='datetime64[ns]', freq='D')
    
    Specify `end` and `periods`, the number of periods (days).
    
    >>> pd.date_range(end='1/1/2018', periods=8)
    DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
                   '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
                  dtype='datetime64[ns]', freq='D')
    
    Specify `start`, `end`, and `periods`; the frequency is generated
    automatically (linearly spaced).
    
    >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
    DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
                   '2018-04-27 00:00:00'], freq=None)
    
    **Other Parameters**
    
    Changed the `freq` (frequency) to ``'M'`` (month end frequency).
    
    >>> pd.date_range(start='1/1/2018', periods=5, freq='M')
    DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
                   '2018-05-31'],
                  dtype='datetime64[ns]', freq='M')
    
    Multiples are allowed
    
    >>> pd.date_range(start='1/1/2018', periods=5, freq='3M')
    DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
                   '2019-01-31'],
                  dtype='datetime64[ns]', freq='3M')
    
    `freq` can also be specified as an Offset object.
    
    >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
    DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
                   '2019-01-31'],
                  dtype='datetime64[ns]', freq='3M')
    
    Specify `tz` to set the timezone.
    
    >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo')
    DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00',
                   '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00',
                   '2018-01-05 00:00:00+09:00'],
                  dtype='datetime64[ns, Asia/Tokyo]', freq='D')
    
    `closed` controls whether to include `start` and `end` that are on the
    boundary. The default includes boundary points on either end.
    
    >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed=None)
    DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],
                  dtype='datetime64[ns]', freq='D')
    
    Use ``closed='left'`` to exclude `end` if it falls on the boundary.
    
    >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='left')
    DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'],
                  dtype='datetime64[ns]', freq='D')
    
    Use ``closed='right'`` to exclude `start` if it falls on the boundary.
    
    >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='right')
    DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
                  dtype='datetime64[ns]', freq='D')
pd.date_range('1/1/2011', periods=5, freq='M')
DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31', '2011-04-30',
               '2011-05-31'],
              dtype='datetime64[ns]', freq='M')

bdate_range

去掉周六周日

pd.bdate_range('9/6/2019', periods=5)
DatetimeIndex(['2019-09-06', '2019-09-09', '2019-09-10', '2019-09-11',
               '2019-09-12'],
              dtype='datetime64[ns]', freq='B')
pd.bdate_range('9/7/2019', periods=5)
DatetimeIndex(['2019-09-09', '2019-09-10', '2019-09-11', '2019-09-12',
               '2019-09-13'],
              dtype='datetime64[ns]', freq='B')
start= pd.datetime(2011, 1, 1)
end = pd.datetime(2011, 1, 5)
pd.date_range(start, end)
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
               '2011-01-05'],
              dtype='datetime64[ns]', freq='D')

一些freq的简写, D, M, Y...

Alias Description Alias Description
B business day frequency BQS business quarter start frequency
D calendar day frequency A annual(Year) end frequency
W weekly frequency BA business year end frequency
M month end frequency BAS business year start frequency
SM semi-month end frequency BH business hour frequency
BM business month end frequency H hourly frequency
MS month start frequency T, min minutely frequency
SMS SMS semi month start frequency S secondly frequency
BMS business month start frequency L, ms milliseconds
Q quarter end frequency U, us microseconds
BQ business quarter end frequency N nanoseconds
QS quarter start frequency
pd.date_range('9/6/2019', periods=5, freq='2W')
DatetimeIndex(['2019-09-08', '2019-09-22', '2019-10-06', '2019-10-20',
               '2019-11-03'],
              dtype='datetime64[ns]', freq='2W-SUN')

Timedelta 用以描述时间差

help(pd.Timedelta)
Help on class Timedelta in module pandas._libs.tslibs.timedeltas:

class Timedelta(_Timedelta)
 |  Timedelta(value=<object object at 0x000002BFBD410440>, unit=None, **kwargs)
 |  
 |  Represents a duration, the difference between two dates or times.
 |  
 |  Timedelta is the pandas equivalent of python's ``datetime.timedelta``
 |  and is interchangeable with it in most cases.
 |  
 |  Parameters
 |  ----------
 |  value : Timedelta, timedelta, np.timedelta64, string, or integer
 |  unit : string, {'ns', 'us', 'ms', 's', 'm', 'h', 'D'}, optional
 |      Denote the unit of the input, if input is an integer. Default 'ns'.
 |  days, seconds, microseconds,
 |  milliseconds, minutes, hours, weeks : numeric, optional
 |      Values for construction in compat with datetime.timedelta.
 |      np ints and floats will be coereced to python ints and floats.
 |  
 |  Notes
 |  -----
 |  The ``.value`` attribute is always in ns.
 |  
 |  Method resolution order:
 |      Timedelta
 |      _Timedelta
 |      datetime.timedelta
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __abs__(self)
 |  
 |  __add__(self, other)
 |  
 |  __divmod__(self, other)
 |  
 |  __floordiv__(self, other)
 |  
 |  __inv__(self)
 |  
 |  __mod__(self, other)
 |  
 |  __mul__(self, other)
 |  
 |  __neg__(self)
 |  
 |  __new__(cls, value=<object object at 0x000002BFBD410440>, unit=None, **kwargs)
 |  
 |  __pos__(self)
 |  
 |  __radd__(self, other)
 |  
 |  __rdivmod__(self, other)
 |  
 |  __reduce__(self)
 |  
 |  __rfloordiv__(self, other)
 |  
 |  __rmod__(self, other)
 |  
 |  __rmul__ = __mul__(self, other)
 |  
 |  __rsub__(self, other)
 |  
 |  __rtruediv__(self, other)
 |  
 |  __setstate__(self, state)
 |  
 |  __sub__(self, other)
 |  
 |  __truediv__(self, other)
 |  
 |  ceil(self, freq)
 |      return a new Timedelta ceiled to this resolution
 |      
 |      Parameters
 |      ----------
 |      freq : a freq string indicating the ceiling resolution
 |  
 |  floor(self, freq)
 |      return a new Timedelta floored to this resolution
 |      
 |      Parameters
 |      ----------
 |      freq : a freq string indicating the flooring resolution
 |  
 |  round(self, freq)
 |      Round the Timedelta to the specified resolution
 |      
 |      Returns
 |      -------
 |      a new Timedelta rounded to the given resolution of `freq`
 |      
 |      Parameters
 |      ----------
 |      freq : a freq string indicating the rounding resolution
 |      
 |      Raises
 |      ------
 |      ValueError if the freq cannot be converted
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  max = Timedelta('106751 days 23:47:16.854775')
 |  
 |  min = Timedelta('-106752 days +00:12:43.145224')
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from _Timedelta:
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __hash__(self, /)
 |      Return hash(self).
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __reduce_cython__(...)
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  __setstate_cython__(...)
 |  
 |  __str__(self, /)
 |      Return str(self).
 |  
 |  isoformat(...)
 |      Format Timedelta as ISO 8601 Duration like
 |      ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the
 |      values. See https://en.wikipedia.org/wiki/ISO_8601#Durations
 |      
 |      .. versionadded:: 0.20.0
 |      
 |      Returns
 |      -------
 |      formatted : str
 |      
 |      Notes
 |      -----
 |      The longest component is days, whose value may be larger than
 |      365.
 |      Every component is always included, even if its value is 0.
 |      Pandas uses nanosecond precision, so up to 9 decimal places may
 |      be included in the seconds component.
 |      Trailing 0's are removed from the seconds component after the decimal.
 |      We do not 0 pad components, so it's `...T5H...`, not `...T05H...`
 |      
 |      Examples
 |      --------
 |      >>> td = pd.Timedelta(days=6, minutes=50, seconds=3,
 |      ...                   milliseconds=10, microseconds=10, nanoseconds=12)
 |      >>> td.isoformat()
 |      'P6DT0H50M3.010010012S'
 |      >>> pd.Timedelta(hours=1, seconds=10).isoformat()
 |      'P0DT0H0M10S'
 |      >>> pd.Timedelta(hours=1, seconds=10).isoformat()
 |      'P0DT0H0M10S'
 |      >>> pd.Timedelta(days=500.5).isoformat()
 |      'P500DT12H0MS'
 |      
 |      See Also
 |      --------
 |      Timestamp.isoformat
 |  
 |  to_pytimedelta(...)
 |      return an actual datetime.timedelta object
 |      note: we lose nanosecond resolution if any
 |  
 |  to_timedelta64(...)
 |      Returns a numpy.timedelta64 object with 'ns' precision
 |  
 |  total_seconds(...)
 |      Total duration of timedelta in seconds (to ns precision)
 |  
 |  view(...)
 |      array view compat
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from _Timedelta:
 |  
 |  asm8
 |      return a numpy timedelta64 array view of myself
 |  
 |  components
 |      Return a Components NamedTuple-like
 |  
 |  delta
 |      Return the timedelta in nanoseconds (ns), for internal compatibility.
 |      
 |      Returns
 |      -------
 |      int
 |          Timedelta in nanoseconds.
 |      
 |      Examples
 |      --------
 |      >>> td = pd.Timedelta('1 days 42 ns')
 |      >>> td.delta
 |      86400000000042
 |      
 |      >>> td = pd.Timedelta('3 s')
 |      >>> td.delta
 |      3000000000
 |      
 |      >>> td = pd.Timedelta('3 ms 5 us')
 |      >>> td.delta
 |      3005000
 |      
 |      >>> td = pd.Timedelta(42, unit='ns')
 |      >>> td.delta
 |      42
 |  
 |  freq
 |  
 |  is_populated
 |  
 |  nanoseconds
 |      Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
 |      
 |      Returns
 |      -------
 |      int
 |          Number of nanoseconds.
 |      
 |      See Also
 |      --------
 |      Timedelta.components : Return all attributes with assigned values
 |          (i.e. days, hours, minutes, seconds, milliseconds, microseconds,
 |          nanoseconds).
 |      
 |      Examples
 |      --------
 |      **Using string input**
 |      
 |      >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
 |      >>> td.nanoseconds
 |      42
 |      
 |      **Using integer input**
 |      
 |      >>> td = pd.Timedelta(42, unit='ns')
 |      >>> td.nanoseconds
 |      42
 |  
 |  resolution
 |      return a string representing the lowest resolution that we have
 |  
 |  value
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from _Timedelta:
 |  
 |  __array_priority__ = 100
 |  
 |  __pyx_vtable__ = <capsule object NULL>
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from datetime.timedelta:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from datetime.timedelta:
 |  
 |  days
 |      Number of days.
 |  
 |  microseconds
 |      Number of microseconds (>= 0 and less than 1 second).
 |  
 |  seconds
 |      Number of seconds (>= 0 and less than 1 day).
pd.Timedelta('2 days 2 hours 15 minutes 30 seconds')
Timedelta('2 days 02:15:30')
pd.Timedelta(6, unit='h')
Timedelta('0 days 06:00:00')
pd.Timedelta(days=2)
Timedelta('2 days 00:00:00')

to_timedelta()

pd.to_timedelta(['2 days 2 hours 15 minutes 30 seconds', '2 days 2 hours 15 minutes 30 seconds'])
TimedeltaIndex(['2 days 02:15:30', '2 days 02:15:30'], dtype='timedelta64[ns]', freq=None)
s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D'))
td = pd.Series([ pd.Timedelta(days=i) for i in range(3) ])
df = pd.DataFrame(dict(A = s, B = td))
df
A B
0 2012-01-01 0 days
1 2012-01-02 1 days
2 2012-01-03 2 days
df['C'] = df['A'] + df['B']
df
A B C
0 2012-01-01 0 days 2012-01-01
1 2012-01-02 1 days 2012-01-03
2 2012-01-03 2 days 2012-01-05
df['D'] = df['C'] - df['B']
df
A B C D
0 2012-01-01 0 days 2012-01-01 2012-01-01
1 2012-01-02 1 days 2012-01-03 2012-01-02
2 2012-01-03 2 days 2012-01-05 2012-01-03

Categorical Data

category

s = pd.Series(["a", 'b', 'c', 'a'], dtype='category')
s
0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

pd.Categorical

pandas.Categorical(values, categories, ordered)

help(pd.Categorical)
Help on class Categorical in module pandas.core.arrays.categorical:

class Categorical(pandas.core.arrays.base.ExtensionArray, pandas.core.base.PandasObject)
 |  Categorical(values, categories=None, ordered=None, dtype=None, fastpath=False)
 |  
 |  Represents a categorical variable in classic R / S-plus fashion
 |  
 |  `Categoricals` can only take on only a limited, and usually fixed, number
 |  of possible values (`categories`). In contrast to statistical categorical
 |  variables, a `Categorical` might have an order, but numerical operations
 |  (additions, divisions, ...) are not possible.
 |  
 |  All values of the `Categorical` are either in `categories` or `np.nan`.
 |  Assigning values outside of `categories` will raise a `ValueError`. Order
 |  is defined by the order of the `categories`, not lexical order of the
 |  values.
 |  
 |  Parameters
 |  ----------
 |  values : list-like
 |      The values of the categorical. If categories are given, values not in
 |      categories will be replaced with NaN.
 |  categories : Index-like (unique), optional
 |      The unique categories for this categorical. If not given, the
 |      categories are assumed to be the unique values of values.
 |  ordered : boolean, (default False)
 |      Whether or not this categorical is treated as a ordered categorical.
 |      If not given, the resulting categorical will not be ordered.
 |  dtype : CategoricalDtype
 |      An instance of ``CategoricalDtype`` to use for this categorical
 |  
 |      .. versionadded:: 0.21.0
 |  
 |  Attributes
 |  ----------
 |  categories : Index
 |      The categories of this categorical
 |  codes : ndarray
 |      The codes (integer positions, which point to the categories) of this
 |      categorical, read only.
 |  ordered : boolean
 |      Whether or not this Categorical is ordered.
 |  dtype : CategoricalDtype
 |      The instance of ``CategoricalDtype`` storing the ``categories``
 |      and ``ordered``.
 |  
 |      .. versionadded:: 0.21.0
 |  
 |  Methods
 |  -------
 |  from_codes
 |  __array__
 |  
 |  Raises
 |  ------
 |  ValueError
 |      If the categories do not validate.
 |  TypeError
 |      If an explicit ``ordered=True`` is given but no `categories` and the
 |      `values` are not sortable.
 |  
 |  Examples
 |  --------
 |  >>> pd.Categorical([1, 2, 3, 1, 2, 3])
 |  [1, 2, 3, 1, 2, 3]
 |  Categories (3, int64): [1, 2, 3]
 |  
 |  >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
 |  [a, b, c, a, b, c]
 |  Categories (3, object): [a, b, c]
 |  
 |  Ordered `Categoricals` can be sorted according to the custom order
 |  of the categories and can have a min and max value.
 |  
 |  >>> c = pd.Categorical(['a','b','c','a','b','c'], ordered=True,
 |  ...                    categories=['c', 'b', 'a'])
 |  >>> c
 |  [a, b, c, a, b, c]
 |  Categories (3, object): [c < b < a]
 |  >>> c.min()
 |  'c'
 |  
 |  Notes
 |  -----
 |  See the `user guide
 |  <http://pandas.pydata.org/pandas-docs/stable/categorical.html>`_ for more.
 |  
 |  See also
 |  --------
 |  pandas.api.types.CategoricalDtype : Type for categorical data
 |  CategoricalIndex : An Index with an underlying ``Categorical``
 |  
 |  Method resolution order:
 |      Categorical
 |      pandas.core.arrays.base.ExtensionArray
 |      pandas.core.base.PandasObject
 |      pandas.core.base.StringMixin
 |      pandas.core.accessor.DirNamesMixin
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __array__(self, dtype=None)
 |      The numpy array interface.
 |      
 |      Returns
 |      -------
 |      values : numpy array
 |          A numpy array of either the specified dtype or,
 |          if dtype==None (default), the same dtype as
 |          categorical.categories.dtype
 |  
 |  __eq__(self, other)
 |  
 |  __ge__(self, other)
 |  
 |  __getitem__(self, key)
 |      Return an item.
 |  
 |  __gt__(self, other)
 |  
 |  __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self)
 |      Returns an Iterator over the values of this Categorical.
 |  
 |  __le__(self, other)
 |  
 |  __len__(self)
 |      The length of this Categorical.
 |  
 |  __lt__(self, other)
 |  
 |  __ne__(self, other)
 |  
 |  __setitem__(self, key, value)
 |      Item assignment.
 |      
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If (one or more) Value is not in categories or if a assigned
 |          `Categorical` does not have the same categories
 |  
 |  __setstate__(self, state)
 |      Necessary for making this object picklable
 |  
 |  __unicode__(self)
 |      Unicode representation.
 |  
 |  add_categories(self, new_categories, inplace=False)
 |      Add new categories.
 |      
 |      `new_categories` will be included at the last/highest place in the
 |      categories and will be unused directly after this call.
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If the new categories include old categories or do not validate as
 |          categories
 |      
 |      Parameters
 |      ----------
 |      new_categories : category or list-like of category
 |         The new categories to be included.
 |      inplace : boolean (default: False)
 |         Whether or not to add the categories inplace or return a copy of
 |         this categorical with added categories.
 |      
 |      Returns
 |      -------
 |      cat : Categorical with new categories added or None if inplace.
 |      
 |      See also
 |      --------
 |      rename_categories
 |      reorder_categories
 |      remove_categories
 |      remove_unused_categories
 |      set_categories
 |  
 |  argsort(self, *args, **kwargs)
 |      Return the indicies that would sort the Categorical.
 |      
 |      Parameters
 |      ----------
 |      ascending : bool, default True
 |          Whether the indices should result in an ascending
 |          or descending sort.
 |      kind : {'quicksort', 'mergesort', 'heapsort'}, optional
 |          Sorting algorithm.
 |      *args, **kwargs:
 |          passed through to :func:`numpy.argsort`.
 |      
 |      Returns
 |      -------
 |      argsorted : numpy array
 |      
 |      See also
 |      --------
 |      numpy.ndarray.argsort
 |      
 |      Notes
 |      -----
 |      While an ordering is applied to the category values, arg-sorting
 |      in this context refers more to organizing and grouping together
 |      based on matching category values. Thus, this function can be
 |      called on an unordered Categorical instance unlike the functions
 |      'Categorical.min' and 'Categorical.max'.
 |      
 |      Examples
 |      --------
 |      >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
 |      array([2, 0, 1, 3])
 |      
 |      >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
 |      ...                      categories=['c', 'b', 'a'],
 |      ...                      ordered=True)
 |      >>> cat.argsort()
 |      array([3, 0, 1, 2])
 |  
 |  as_ordered(self, inplace=False)
 |      Sets the Categorical to be ordered
 |      
 |      Parameters
 |      ----------
 |      inplace : boolean (default: False)
 |         Whether or not to set the ordered attribute inplace or return a copy
 |         of this categorical with ordered set to True
 |  
 |  as_unordered(self, inplace=False)
 |      Sets the Categorical to be unordered
 |      
 |      Parameters
 |      ----------
 |      inplace : boolean (default: False)
 |         Whether or not to set the ordered attribute inplace or return a copy
 |         of this categorical with ordered set to False
 |  
 |  astype(self, dtype, copy=True)
 |      Coerce this type to another dtype
 |      
 |      Parameters
 |      ----------
 |      dtype : numpy dtype or pandas type
 |      copy : bool, default True
 |          By default, astype always returns a newly allocated object.
 |          If copy is set to False and dtype is categorical, the original
 |          object is returned.
 |      
 |          .. versionadded:: 0.19.0
 |  
 |  check_for_ordered(self, op)
 |      assert that we are ordered
 |  
 |  copy(self)
 |      Copy constructor.
 |  
 |  describe(self)
 |      Describes this Categorical
 |      
 |      Returns
 |      -------
 |      description: `DataFrame`
 |          A dataframe with frequency and counts by category.
 |  
 |  dropna(self)
 |      Return the Categorical without null values.
 |      
 |      Missing values (-1 in .codes) are detected.
 |      
 |      Returns
 |      -------
 |      valid : Categorical
 |  
 |  equals(self, other)
 |      Returns True if categorical arrays are equal.
 |      
 |      Parameters
 |      ----------
 |      other : `Categorical`
 |      
 |      Returns
 |      -------
 |      are_equal : boolean
 |  
 |  fillna(self, value=None, method=None, limit=None)
 |      Fill NA/NaN values using the specified method.
 |      
 |      Parameters
 |      ----------
 |      value : scalar, dict, Series
 |          If a scalar value is passed it is used to fill all missing values.
 |          Alternatively, a Series or dict can be used to fill in different
 |          values for each index. The value should not be a list. The
 |          value(s) passed should either be in the categories or should be
 |          NaN.
 |      method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
 |          Method to use for filling holes in reindexed Series
 |          pad / ffill: propagate last valid observation forward to next valid
 |          backfill / bfill: use NEXT valid observation to fill gap
 |      limit : int, default None
 |          (Not implemented yet for Categorical!)
 |          If method is specified, this is the maximum number of consecutive
 |          NaN values to forward/backward fill. In other words, if there is
 |          a gap with more than this number of consecutive NaNs, it will only
 |          be partially filled. If method is not specified, this is the
 |          maximum number of entries along the entire axis where NaNs will be
 |          filled.
 |      
 |      Returns
 |      -------
 |      filled : Categorical with NA/NaN filled
 |  
 |  get_values(self)
 |      Return the values.
 |      
 |      For internal compatibility with pandas formatting.
 |      
 |      Returns
 |      -------
 |      values : numpy array
 |          A numpy array of the same dtype as categorical.categories.dtype or
 |          Index if datetime / periods
 |  
 |  is_dtype_equal(self, other)
 |      Returns True if categoricals are the same dtype
 |        same categories, and same ordered
 |      
 |      Parameters
 |      ----------
 |      other : Categorical
 |      
 |      Returns
 |      -------
 |      are_equal : boolean
 |  
 |  isin(self, values)
 |      Check whether `values` are contained in Categorical.
 |      
 |      Return a boolean NumPy Array showing whether each element in
 |      the Categorical matches an element in the passed sequence of
 |      `values` exactly.
 |      
 |      Parameters
 |      ----------
 |      values : set or list-like
 |          The sequence of values to test. Passing in a single string will
 |          raise a ``TypeError``. Instead, turn a single string into a
 |          list of one element.
 |      
 |      Returns
 |      -------
 |      isin : numpy.ndarray (bool dtype)
 |      
 |      Raises
 |      ------
 |      TypeError
 |        * If `values` is not a set or list-like
 |      
 |      See Also
 |      --------
 |      pandas.Series.isin : equivalent method on Series
 |      
 |      Examples
 |      --------
 |      
 |      >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
 |      ...                'hippo'])
 |      >>> s.isin(['cow', 'lama'])
 |      array([ True,  True,  True, False,  True, False])
 |      
 |      Passing a single string as ``s.isin('lama')`` will raise an error. Use
 |      a list of one element instead:
 |      
 |      >>> s.isin(['lama'])
 |      array([ True, False,  True, False,  True, False])
 |  
 |  isna(self)
 |      Detect missing values
 |      
 |      Missing values (-1 in .codes) are detected.
 |      
 |      Returns
 |      -------
 |      a boolean array of whether my values are null
 |      
 |      See also
 |      --------
 |      isna : top-level isna
 |      isnull : alias of isna
 |      Categorical.notna : boolean inverse of Categorical.isna
 |  
 |  isnull = isna(self)
 |  
 |  map(self, mapper)
 |      Map categories using input correspondence (dict, Series, or function).
 |      
 |      Maps the categories to new categories. If the mapping correspondence is
 |      one-to-one the result is a :class:`~pandas.Categorical` which has the
 |      same order property as the original, otherwise a :class:`~pandas.Index`
 |      is returned.
 |      
 |      If a `dict` or :class:`~pandas.Series` is used any unmapped category is
 |      mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
 |      will be returned.
 |      
 |      Parameters
 |      ----------
 |      mapper : function, dict, or Series
 |          Mapping correspondence.
 |      
 |      Returns
 |      -------
 |      pandas.Categorical or pandas.Index
 |          Mapped categorical.
 |      
 |      See Also
 |      --------
 |      CategoricalIndex.map : Apply a mapping correspondence on a
 |          :class:`~pandas.CategoricalIndex`.
 |      Index.map : Apply a mapping correspondence on an
 |          :class:`~pandas.Index`.
 |      Series.map : Apply a mapping correspondence on a
 |          :class:`~pandas.Series`.
 |      Series.apply : Apply more complex functions on a
 |          :class:`~pandas.Series`.
 |      
 |      Examples
 |      --------
 |      >>> cat = pd.Categorical(['a', 'b', 'c'])
 |      >>> cat
 |      [a, b, c]
 |      Categories (3, object): [a, b, c]
 |      >>> cat.map(lambda x: x.upper())
 |      [A, B, C]
 |      Categories (3, object): [A, B, C]
 |      >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
 |      [first, second, third]
 |      Categories (3, object): [first, second, third]
 |      
 |      If the mapping is one-to-one the ordering of the categories is
 |      preserved:
 |      
 |      >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
 |      >>> cat
 |      [a, b, c]
 |      Categories (3, object): [a < b < c]
 |      >>> cat.map({'a': 3, 'b': 2, 'c': 1})
 |      [3, 2, 1]
 |      Categories (3, int64): [3 < 2 < 1]
 |      
 |      If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
 |      
 |      >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
 |      Index(['first', 'second', 'first'], dtype='object')
 |      
 |      If a `dict` is used, all unmapped categories are mapped to `NaN` and
 |      the result is an :class:`~pandas.Index`:
 |      
 |      >>> cat.map({'a': 'first', 'b': 'second'})
 |      Index(['first', 'second', nan], dtype='object')
 |  
 |  max(self, numeric_only=None, **kwargs)
 |      The maximum value of the object.
 |      
 |      Only ordered `Categoricals` have a maximum!
 |      
 |      Raises
 |      ------
 |      TypeError
 |          If the `Categorical` is not `ordered`.
 |      
 |      Returns
 |      -------
 |      max : the maximum of this `Categorical`
 |  
 |  memory_usage(self, deep=False)
 |      Memory usage of my values
 |      
 |      Parameters
 |      ----------
 |      deep : bool
 |          Introspect the data deeply, interrogate
 |          `object` dtypes for system-level memory consumption
 |      
 |      Returns
 |      -------
 |      bytes used
 |      
 |      Notes
 |      -----
 |      Memory usage does not include memory consumed by elements that
 |      are not components of the array if deep=False
 |      
 |      See Also
 |      --------
 |      numpy.ndarray.nbytes
 |  
 |  min(self, numeric_only=None, **kwargs)
 |      The minimum value of the object.
 |      
 |      Only ordered `Categoricals` have a minimum!
 |      
 |      Raises
 |      ------
 |      TypeError
 |          If the `Categorical` is not `ordered`.
 |      
 |      Returns
 |      -------
 |      min : the minimum of this `Categorical`
 |  
 |  mode(self)
 |      Returns the mode(s) of the Categorical.
 |      
 |      Always returns `Categorical` even if only one value.
 |      
 |      Returns
 |      -------
 |      modes : `Categorical` (sorted)
 |  
 |  notna(self)
 |      Inverse of isna
 |      
 |      Both missing values (-1 in .codes) and NA as a category are detected as
 |      null.
 |      
 |      Returns
 |      -------
 |      a boolean array of whether my values are not null
 |      
 |      See also
 |      --------
 |      notna : top-level notna
 |      notnull : alias of notna
 |      Categorical.isna : boolean inverse of Categorical.notna
 |  
 |  notnull = notna(self)
 |  
 |  put(self, *args, **kwargs)
 |      Replace specific elements in the Categorical with given values.
 |  
 |  ravel(self, order='C')
 |      Return a flattened (numpy) array.
 |      
 |      For internal compatibility with numpy arrays.
 |      
 |      Returns
 |      -------
 |      raveled : numpy array
 |  
 |  remove_categories(self, removals, inplace=False)
 |      Removes the specified categories.
 |      
 |      `removals` must be included in the old categories. Values which were in
 |      the removed categories will be set to NaN
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If the removals are not contained in the categories
 |      
 |      Parameters
 |      ----------
 |      removals : category or list of categories
 |         The categories which should be removed.
 |      inplace : boolean (default: False)
 |         Whether or not to remove the categories inplace or return a copy of
 |         this categorical with removed categories.
 |      
 |      Returns
 |      -------
 |      cat : Categorical with removed categories or None if inplace.
 |      
 |      See also
 |      --------
 |      rename_categories
 |      reorder_categories
 |      add_categories
 |      remove_unused_categories
 |      set_categories
 |  
 |  remove_unused_categories(self, inplace=False)
 |      Removes categories which are not used.
 |      
 |      Parameters
 |      ----------
 |      inplace : boolean (default: False)
 |         Whether or not to drop unused categories inplace or return a copy of
 |         this categorical with unused categories dropped.
 |      
 |      Returns
 |      -------
 |      cat : Categorical with unused categories dropped or None if inplace.
 |      
 |      See also
 |      --------
 |      rename_categories
 |      reorder_categories
 |      add_categories
 |      remove_categories
 |      set_categories
 |  
 |  rename_categories(self, new_categories, inplace=False)
 |      Renames categories.
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If new categories are list-like and do not have the same number of
 |          items than the current categories or do not validate as categories
 |      
 |      Parameters
 |      ----------
 |      new_categories : list-like, dict-like or callable
 |      
 |         * list-like: all items must be unique and the number of items in
 |           the new categories must match the existing number of categories.
 |      
 |         * dict-like: specifies a mapping from
 |           old categories to new. Categories not contained in the mapping
 |           are passed through and extra categories in the mapping are
 |           ignored.
 |      
 |           .. versionadded:: 0.21.0
 |      
 |         * callable : a callable that is called on all items in the old
 |           categories and whose return values comprise the new categories.
 |      
 |           .. versionadded:: 0.23.0
 |      
 |         .. warning::
 |      
 |            Currently, Series are considered list like. In a future version
 |            of pandas they'll be considered dict-like.
 |      
 |      inplace : boolean (default: False)
 |         Whether or not to rename the categories inplace or return a copy of
 |         this categorical with renamed categories.
 |      
 |      Returns
 |      -------
 |      cat : Categorical or None
 |         With ``inplace=False``, the new categorical is returned.
 |         With ``inplace=True``, there is no return value.
 |      
 |      See also
 |      --------
 |      reorder_categories
 |      add_categories
 |      remove_categories
 |      remove_unused_categories
 |      set_categories
 |      
 |      Examples
 |      --------
 |      >>> c = Categorical(['a', 'a', 'b'])
 |      >>> c.rename_categories([0, 1])
 |      [0, 0, 1]
 |      Categories (2, int64): [0, 1]
 |      
 |      For dict-like ``new_categories``, extra keys are ignored and
 |      categories not in the dictionary are passed through
 |      
 |      >>> c.rename_categories({'a': 'A', 'c': 'C'})
 |      [A, A, b]
 |      Categories (2, object): [A, b]
 |      
 |      You may also provide a callable to create the new categories
 |      
 |      >>> c.rename_categories(lambda x: x.upper())
 |      [A, A, B]
 |      Categories (2, object): [A, B]
 |  
 |  reorder_categories(self, new_categories, ordered=None, inplace=False)
 |      Reorders categories as specified in new_categories.
 |      
 |      `new_categories` need to include all old categories and no new category
 |      items.
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If the new categories do not contain all old category items or any
 |          new ones
 |      
 |      Parameters
 |      ----------
 |      new_categories : Index-like
 |         The categories in new order.
 |      ordered : boolean, optional
 |         Whether or not the categorical is treated as a ordered categorical.
 |         If not given, do not change the ordered information.
 |      inplace : boolean (default: False)
 |         Whether or not to reorder the categories inplace or return a copy of
 |         this categorical with reordered categories.
 |      
 |      Returns
 |      -------
 |      cat : Categorical with reordered categories or None if inplace.
 |      
 |      See also
 |      --------
 |      rename_categories
 |      add_categories
 |      remove_categories
 |      remove_unused_categories
 |      set_categories
 |  
 |  repeat(self, repeats, *args, **kwargs)
 |      Repeat elements of a Categorical.
 |      
 |      See also
 |      --------
 |      numpy.ndarray.repeat
 |  
 |  searchsorted(self, value, side='left', sorter=None)
 |      Find indices where elements should be inserted to maintain order.
 |      
 |      Find the indices into a sorted Categorical `self` such that, if the
 |      corresponding elements in `value` were inserted before the indices,
 |      the order of `self` would be preserved.
 |      
 |      Parameters
 |      ----------
 |      value : array_like
 |          Values to insert into `self`.
 |      side : {'left', 'right'}, optional
 |          If 'left', the index of the first suitable location found is given.
 |          If 'right', return the last such index.  If there is no suitable
 |          index, return either 0 or N (where N is the length of `self`).
 |      sorter : 1-D array_like, optional
 |          Optional array of integer indices that sort `self` into ascending
 |          order. They are typically the result of ``np.argsort``.
 |      
 |      Returns
 |      -------
 |      indices : array of ints
 |          Array of insertion points with the same shape as `value`.
 |      
 |      See Also
 |      --------
 |      numpy.searchsorted
 |      
 |      Notes
 |      -----
 |      Binary search is used to find the required insertion points.
 |      
 |      Examples
 |      --------
 |      
 |      >>> x = pd.Series([1, 2, 3])
 |      >>> x
 |      0    1
 |      1    2
 |      2    3
 |      dtype: int64
 |      
 |      >>> x.searchsorted(4)
 |      array([3])
 |      
 |      >>> x.searchsorted([0, 4])
 |      array([0, 3])
 |      
 |      >>> x.searchsorted([1, 3], side='left')
 |      array([0, 2])
 |      
 |      >>> x.searchsorted([1, 3], side='right')
 |      array([1, 3])
 |      
 |      >>> x = pd.Categorical(['apple', 'bread', 'bread',
 |                              'cheese', 'milk'], ordered=True)
 |      [apple, bread, bread, cheese, milk]
 |      Categories (4, object): [apple < bread < cheese < milk]
 |      
 |      >>> x.searchsorted('bread')
 |      array([1])     # Note: an array, not a scalar
 |      
 |      >>> x.searchsorted(['bread'], side='right')
 |      array([3])
 |  
 |  set_categories(self, new_categories, ordered=None, rename=False, inplace=False)
 |      Sets the categories to the specified new_categories.
 |      
 |      `new_categories` can include new categories (which will result in
 |      unused categories) or remove old categories (which results in values
 |      set to NaN). If `rename==True`, the categories will simple be renamed
 |      (less or more items than in old categories will result in values set to
 |      NaN or in unused categories respectively).
 |      
 |      This method can be used to perform more than one action of adding,
 |      removing, and reordering simultaneously and is therefore faster than
 |      performing the individual steps via the more specialised methods.
 |      
 |      On the other hand this methods does not do checks (e.g., whether the
 |      old categories are included in the new categories on a reorder), which
 |      can result in surprising changes, for example when using special string
 |      dtypes on python3, which does not considers a S1 string equal to a
 |      single char python string.
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If new_categories does not validate as categories
 |      
 |      Parameters
 |      ----------
 |      new_categories : Index-like
 |         The categories in new order.
 |      ordered : boolean, (default: False)
 |         Whether or not the categorical is treated as a ordered categorical.
 |         If not given, do not change the ordered information.
 |      rename : boolean (default: False)
 |         Whether or not the new_categories should be considered as a rename
 |         of the old categories or as reordered categories.
 |      inplace : boolean (default: False)
 |         Whether or not to reorder the categories inplace or return a copy of
 |         this categorical with reordered categories.
 |      
 |      Returns
 |      -------
 |      cat : Categorical with reordered categories or None if inplace.
 |      
 |      See also
 |      --------
 |      rename_categories
 |      reorder_categories
 |      add_categories
 |      remove_categories
 |      remove_unused_categories
 |  
 |  set_ordered(self, value, inplace=False)
 |      Sets the ordered attribute to the boolean value
 |      
 |      Parameters
 |      ----------
 |      value : boolean to set whether this categorical is ordered (True) or
 |         not (False)
 |      inplace : boolean (default: False)
 |         Whether or not to set the ordered attribute inplace or return a copy
 |         of this categorical with ordered set to the value
 |  
 |  shift(self, periods)
 |      Shift Categorical by desired number of periods.
 |      
 |      Parameters
 |      ----------
 |      periods : int
 |          Number of periods to move, can be positive or negative
 |      
 |      Returns
 |      -------
 |      shifted : Categorical
 |  
 |  sort_values(self, inplace=False, ascending=True, na_position='last')
 |      Sorts the Categorical by category value returning a new
 |      Categorical by default.
 |      
 |      While an ordering is applied to the category values, sorting in this
 |      context refers more to organizing and grouping together based on
 |      matching category values. Thus, this function can be called on an
 |      unordered Categorical instance unlike the functions 'Categorical.min'
 |      and 'Categorical.max'.
 |      
 |      Parameters
 |      ----------
 |      inplace : boolean, default False
 |          Do operation in place.
 |      ascending : boolean, default True
 |          Order ascending. Passing False orders descending. The
 |          ordering parameter provides the method by which the
 |          category values are organized.
 |      na_position : {'first', 'last'} (optional, default='last')
 |          'first' puts NaNs at the beginning
 |          'last' puts NaNs at the end
 |      
 |      Returns
 |      -------
 |      y : Categorical or None
 |      
 |      See Also
 |      --------
 |      Categorical.sort
 |      Series.sort_values
 |      
 |      Examples
 |      --------
 |      >>> c = pd.Categorical([1, 2, 2, 1, 5])
 |      >>> c
 |      [1, 2, 2, 1, 5]
 |      Categories (3, int64): [1, 2, 5]
 |      >>> c.sort_values()
 |      [1, 1, 2, 2, 5]
 |      Categories (3, int64): [1, 2, 5]
 |      >>> c.sort_values(ascending=False)
 |      [5, 2, 2, 1, 1]
 |      Categories (3, int64): [1, 2, 5]
 |      
 |      Inplace sorting can be done as well:
 |      
 |      >>> c.sort_values(inplace=True)
 |      >>> c
 |      [1, 1, 2, 2, 5]
 |      Categories (3, int64): [1, 2, 5]
 |      >>>
 |      >>> c = pd.Categorical([1, 2, 2, 1, 5])
 |      
 |      'sort_values' behaviour with NaNs. Note that 'na_position'
 |      is independent of the 'ascending' parameter:
 |      
 |      >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
 |      >>> c
 |      [NaN, 2.0, 2.0, NaN, 5.0]
 |      Categories (2, int64): [2, 5]
 |      >>> c.sort_values()
 |      [2.0, 2.0, 5.0, NaN, NaN]
 |      Categories (2, int64): [2, 5]
 |      >>> c.sort_values(ascending=False)
 |      [5.0, 2.0, 2.0, NaN, NaN]
 |      Categories (2, int64): [2, 5]
 |      >>> c.sort_values(na_position='first')
 |      [NaN, NaN, 2.0, 2.0, 5.0]
 |      Categories (2, int64): [2, 5]
 |      >>> c.sort_values(ascending=False, na_position='first')
 |      [NaN, NaN, 5.0, 2.0, 2.0]
 |      Categories (2, int64): [2, 5]
 |  
 |  take = take_nd(self, indexer, allow_fill=None, fill_value=None)
 |  
 |  take_nd(self, indexer, allow_fill=None, fill_value=None)
 |      Take elements from the Categorical.
 |      
 |      Parameters
 |      ----------
 |      indexer : sequence of integers
 |      allow_fill : bool, default None.
 |          How to handle negative values in `indexer`.
 |      
 |          * False: negative values in `indices` indicate positional indices
 |            from the right. This is similar to
 |            :func:`numpy.take`.
 |      
 |          * True: negative values in `indices` indicate missing values
 |            (the default). These values are set to `fill_value`. Any other
 |            other negative values raise a ``ValueError``.
 |      
 |          .. versionchanged:: 0.23.0
 |      
 |             Deprecated the default value of `allow_fill`. The deprecated
 |             default is ``True``. In the future, this will change to
 |             ``False``.
 |      
 |      Returns
 |      -------
 |      Categorical
 |          This Categorical will have the same categories and ordered as
 |          `self`.
 |  
 |  to_dense(self)
 |      Return my 'dense' representation
 |      
 |      For internal compatibility with numpy arrays.
 |      
 |      Returns
 |      -------
 |      dense : array
 |  
 |  tolist(self)
 |      Return a list of the values.
 |      
 |      These are each a scalar type, which is a Python scalar
 |      (for str, int, float) or a pandas scalar
 |      (for Timestamp/Timedelta/Interval/Period)
 |  
 |  unique(self)
 |      Return the ``Categorical`` which ``categories`` and ``codes`` are
 |      unique. Unused categories are NOT returned.
 |      
 |      - unordered category: values and categories are sorted by appearance
 |        order.
 |      - ordered category: values are sorted by appearance order, categories
 |        keeps existing order.
 |      
 |      Returns
 |      -------
 |      unique values : ``Categorical``
 |      
 |      Examples
 |      --------
 |      An unordered Categorical will return categories in the
 |      order of appearance.
 |      
 |      >>> pd.Categorical(list('baabc'))
 |      [b, a, c]
 |      Categories (3, object): [b, a, c]
 |      
 |      >>> pd.Categorical(list('baabc'), categories=list('abc'))
 |      [b, a, c]
 |      Categories (3, object): [b, a, c]
 |      
 |      An ordered Categorical preserves the category ordering.
 |      
 |      >>> pd.Categorical(list('baabc'),
 |      ...                categories=list('abc'),
 |      ...                ordered=True)
 |      [b, a, c]
 |      Categories (3, object): [a < b < c]
 |      
 |      See Also
 |      --------
 |      unique
 |      CategoricalIndex.unique
 |      Series.unique
 |  
 |  value_counts(self, dropna=True)
 |      Returns a Series containing counts of each category.
 |      
 |      Every category will have an entry, even those with a count of 0.
 |      
 |      Parameters
 |      ----------
 |      dropna : boolean, default True
 |          Don't include counts of NaN.
 |      
 |      Returns
 |      -------
 |      counts : Series
 |      
 |      See Also
 |      --------
 |      Series.value_counts
 |  
 |  view(self)
 |      Return a view of myself.
 |      
 |      For internal compatibility with numpy arrays.
 |      
 |      Returns
 |      -------
 |      view : Categorical
 |         Returns `self`!
 |  
 |  ----------------------------------------------------------------------
 |  Class methods defined here:
 |  
 |  from_codes(codes, categories, ordered=False) from builtins.type
 |      Make a Categorical type from codes and categories arrays.
 |      
 |      This constructor is useful if you already have codes and categories and
 |      so do not need the (computation intensive) factorization step, which is
 |      usually done on the constructor.
 |      
 |      If your data does not follow this convention, please use the normal
 |      constructor.
 |      
 |      Parameters
 |      ----------
 |      codes : array-like, integers
 |          An integer array, where each integer points to a category in
 |          categories or -1 for NaN
 |      categories : index-like
 |          The categories for the categorical. Items need to be unique.
 |      ordered : boolean, (default False)
 |          Whether or not this categorical is treated as a ordered
 |          categorical. If not given, the resulting categorical will be
 |          unordered.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  T
 |  
 |  base
 |      compat, we are always our own object
 |  
 |  categories
 |      The categories of this categorical.
 |      
 |      Setting assigns new values to each category (effectively a rename of
 |      each individual category).
 |      
 |      The assigned value has to be a list-like object. All items must be
 |      unique and the number of items in the new categories must be the same
 |      as the number of items in the old categories.
 |      
 |      Assigning to `categories` is a inplace operation!
 |      
 |      Raises
 |      ------
 |      ValueError
 |          If the new categories do not validate as categories or if the
 |          number of new categories is unequal the number of old categories
 |      
 |      See also
 |      --------
 |      rename_categories
 |      reorder_categories
 |      add_categories
 |      remove_categories
 |      remove_unused_categories
 |      set_categories
 |  
 |  codes
 |      The category codes of this categorical.
 |      
 |      Level codes are an array if integer which are the positions of the real
 |      values in the categories array.
 |      
 |      There is not setter, use the other categorical methods and the normal item
 |      setter to change values in the categorical.
 |  
 |  dtype
 |      The :class:`~pandas.api.types.CategoricalDtype` for this instance
 |  
 |  itemsize
 |      return the size of a single category
 |  
 |  nbytes
 |      The number of bytes needed to store this object in memory.
 |  
 |  ndim
 |      Number of dimensions of the Categorical
 |  
 |  ordered
 |      Whether the categories have an ordered relationship
 |  
 |  shape
 |      Shape of the Categorical.
 |      
 |      For internal compatibility with numpy arrays.
 |      
 |      Returns
 |      -------
 |      shape : tuple
 |  
 |  size
 |      return the len of myself
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __array_priority__ = 1000
 |  
 |  __hash__ = None
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from pandas.core.arrays.base.ExtensionArray:
 |  
 |  factorize(self, na_sentinel=-1)
 |      Encode the extension array as an enumerated type.
 |      
 |      Parameters
 |      ----------
 |      na_sentinel : int, default -1
 |          Value to use in the `labels` array to indicate missing values.
 |      
 |      Returns
 |      -------
 |      labels : ndarray
 |          An integer NumPy array that's an indexer into the original
 |          ExtensionArray.
 |      uniques : ExtensionArray
 |          An ExtensionArray containing the unique values of `self`.
 |      
 |          .. note::
 |      
 |             uniques will *not* contain an entry for the NA value of
 |             the ExtensionArray if there are any missing values present
 |             in `self`.
 |      
 |      See Also
 |      --------
 |      pandas.factorize : Top-level factorize method that dispatches here.
 |      
 |      Notes
 |      -----
 |      :meth:`pandas.factorize` offers a `sort` keyword as well.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from pandas.core.arrays.base.ExtensionArray:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from pandas.core.base.PandasObject:
 |  
 |  __sizeof__(self)
 |      Generates the total memory usage for an object that returns
 |      either a value or Series of values
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from pandas.core.base.StringMixin:
 |  
 |  __bytes__(self)
 |      Return a string representation for a particular object.
 |      
 |      Invoked by bytes(obj) in py3 only.
 |      Yields a bytestring in both py2/py3.
 |  
 |  __repr__(self)
 |      Return a string representation for a particular object.
 |      
 |      Yields Bytestring in Py2, Unicode String in py3.
 |  
 |  __str__(self)
 |      Return a string representation for a particular Object
 |      
 |      Invoked by str(df) in both py2/py3.
 |      Yields Bytestring in Py2, Unicode String in py3.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from pandas.core.accessor.DirNamesMixin:
 |  
 |  __dir__(self)
 |      Provide method name lookup and completion
 |      Only provide 'public' methods
cat = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
cat
[a, b, c, a, b, c]
Categories (3, object): [a, b, c]
cat = cat=pd.Categorical(['a','b','c','a','b','c','d'], ['c', 'b', 'a'])
cat
[a, b, c, a, b, c, NaN]
Categories (3, object): [c, b, a]
cat = cat=pd.Categorical(['a','b','c','a','b','c','d'], ['c', 'b', 'a'],ordered=True)
cat
[a, b, c, a, b, c, NaN]
Categories (3, object): [c < b < a]
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])
df = pd.DataFrame({"cat":cat, "s":["a", "c", "c", np.nan]})
df.describe()
cat s
count 3 3
unique 2 2
top c c
freq 2 2
df["cat"].describe()
count     3
unique    2
top       c
freq      2
Name: cat, dtype: object
s = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])
s.categories
Index(['b', 'a', 'c'], dtype='object')
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])
cat.ordered
False
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"], ordered=True)
cat.ordered
True
s = pd.Series(["a","b","c","a"], dtype="category")
s.cat
<pandas.core.arrays.categorical.CategoricalAccessor object at 0x00000223285166A0>
s.cat.categories
Index(['a', 'b', 'c'], dtype='object')
s.cat.categories = ["Group %s" % g for g in s.cat.categories]
s.cat.categories
Index(['Group a', 'Group b', 'Group c'], dtype='object')
s
0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): [Group a, Group b, Group c]
s = pd.Series(["a","b","c","a"], dtype="category")
s = s.cat.add_categories([4])
s
0    a
1    b
2    c
3    a
dtype: category
Categories (4, object): [a, b, c, 4]
s = pd.Series(["a","b","c","a"], dtype="category")
s.cat.remove_categories('a')
0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (2, object): [b, c]
s
0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]
cat = pd.Series([1,2,3]).astype("category", categories=[1,2,3], ordered=True)
cat1 = pd.Series([2,2,2]).astype("category", categories=[1,2,3], ordered=True)
cat > cat1
C:Analibsite-packagesipykernel_launcher.py:1: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  """Entry point for launching an IPython kernel.
C:Analibsite-packagesipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  





0    False
1    False
2     True
dtype: bool
cat = pd.Series([1,2,3]).astype("category", categories=[1,2,3], ordered=True)
cat1 = pd.Series([2,2,2]).astype("category", categories=[1,2,3])
cat > cat1
C:Analibsite-packagesipykernel_launcher.py:1: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  """Entry point for launching an IPython kernel.
C:Analibsite-packagesipykernel_launcher.py:2: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  



---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-27-0394598daed0> in <module>
      1 cat = pd.Series([1,2,3]).astype("category", categories=[1,2,3], ordered=True)
      2 cat1 = pd.Series([2,2,2]).astype("category", categories=[1,2,3])
----> 3 cat > cat1


C:Analibsite-packagespandascoreops.py in wrapper(self, other, axis)
   1194             # Dispatch to Categorical implementation; pd.CategoricalIndex
   1195             # behavior is non-canonical GH#19513
-> 1196             res_values = dispatch_to_index_op(op, self, other, pd.Categorical)
   1197             return self._constructor(res_values, index=self.index,
   1198                                      name=res_name)


C:Analibsite-packagespandascoreops.py in dispatch_to_index_op(op, left, right, index_class)
   1099         left_idx = left_idx._shallow_copy(freq=None)
   1100     try:
-> 1101         result = op(left_idx, right)
   1102     except NullFrequencyError:
   1103         # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError


C:Analibsite-packagespandascoreops.py in wrapper(self, other, axis)
   1194             # Dispatch to Categorical implementation; pd.CategoricalIndex
   1195             # behavior is non-canonical GH#19513
-> 1196             res_values = dispatch_to_index_op(op, self, other, pd.Categorical)
   1197             return self._constructor(res_values, index=self.index,
   1198                                      name=res_name)


C:Analibsite-packagespandascoreops.py in dispatch_to_index_op(op, left, right, index_class)
   1099         left_idx = left_idx._shallow_copy(freq=None)
   1100     try:
-> 1101         result = op(left_idx, right)
   1102     except NullFrequencyError:
   1103         # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError


C:Analibsite-packagespandascorearrayscategorical.py in f(self, other)
     73         if not self.ordered:
     74             if op in ['__lt__', '__gt__', '__le__', '__ge__']:
---> 75                 raise TypeError("Unordered Categoricals can only compare "
     76                                 "equality or not")
     77         if isinstance(other, Categorical):


TypeError: Unordered Categoricals can only compare equality or not

Visualization

df = pd.DataFrame(np.random.randn(10,4),index=pd.date_range('1/1/2000',
   periods=10), columns=list('ABCD'))
df
A B C D
2000-01-01 0.931528 0.805856 -0.197488 1.169110
2000-01-02 -0.091044 -1.114388 -0.911058 -0.666875
2000-01-03 -0.667976 -0.912989 0.063826 -1.145067
2000-01-04 -0.321401 0.823028 -0.013197 0.117705
2000-01-05 0.017434 0.727450 -0.741326 -0.021172
2000-01-06 0.618563 -1.015479 -0.555008 0.217190
2000-01-07 -0.419080 1.461645 0.093773 -0.012262
2000-01-08 -1.247346 0.732218 0.700437 0.974215
2000-01-09 0.134827 1.741333 -0.218103 -1.147648
2000-01-10 -0.753395 -0.457911 0.071057 1.674153
df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x16356c17d30>

在这里插入图片描述

bar

df = pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
df.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x16357ce94a8>

在这里插入图片描述

df.plot.bar(stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x16357e1cc18>

在这里插入图片描述

df.plot.barh(stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x16357f08080>

在这里插入图片描述

Histograms

df = pd.DataFrame({'a':np.random.randn(1000)+1,'b':np.random.randn(1000),'c':
np.random.randn(1000) - 1}, columns=['a', 'b', 'c'])
df.plot.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x16357de2470>

在这里插入图片描述

Box Plots

df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
df.plot.box()
<matplotlib.axes._subplots.AxesSubplot at 0x16357ea3b00>

在这里插入图片描述

Area Plot

df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df.plot.area()
<matplotlib.axes._subplots.AxesSubplot at 0x1635830d8d0>

在这里插入图片描述

df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1635846fa58>

在这里插入图片描述

Scatter Plot

df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd'])
df.plot.scatter(x='a', y='b')
<matplotlib.axes._subplots.AxesSubplot at 0x1635876bc18>

在这里插入图片描述

Pie Plot

df = pd.DataFrame(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], columns=['x'])
df.plot.pie(subplots=True)
array([<matplotlib.axes._subplots.AxesSubplot object at 0x0000016358524390>],
      dtype=object)

在这里插入图片描述

IO Tools pd.read_csv

df = pd.read_csv("C:/Users/pkavs/Desktop/temp.csv")
df
S.No Name Age City Salary
0 1 Tom 28 Toronto 20000
1 2 Lee 32 HongKong 3000
2 3 Steven 43 Bay Area 8300
3 4 Ram 38 Hyderabad 3900
df = pd.read_csv("C:/Users/pkavs/Desktop/temp.csv", index_col=['S.No'])
df
Name Age City Salary
S.No
1 Tom 28 Toronto 20000
2 Lee 32 HongKong 3000
3 Steven 43 Bay Area 8300
4 Ram 38 Hyderabad 3900
df = pd.read_csv("C:/Users/pkavs/Desktop/temp.csv", dtype={'Salary': np.float64})  #特别设置数据属性
df
S.No Name Age City Salary
0 1 Tom 28 Toronto 20000.0
1 2 Lee 32 HongKong 3000.0
2 3 Steven 43 Bay Area 8300.0
3 4 Ram 38 Hyderabad 3900.0
df = pd.read_csv("C:/Users/pkavs/Desktop/temp.csv", names=['a', 'b', 'c', 'd', 'e']) #设置行名
df
a b c d e
0 S.No Name Age City Salary
1 1 Tom 28 Toronto 20000
2 2 Lee 32 HongKong 3000
3 3 Steven 43 Bay Area 8300
4 4 Ram 38 Hyderabad 3900
df = pd.read_csv("C:/Users/pkavs/Desktop/temp.csv", names=['a', 'b', 'c', 'd', 'e'], header=0)#省略表头
df
a b c d e
0 1 Tom 28 Toronto 20000
1 2 Lee 32 HongKong 3000
2 3 Steven 43 Bay Area 8300
3 4 Ram 38 Hyderabad 3900
df = pd.read_csv("C:/Users/pkavs/Desktop/temp.csv", skiprows=2)#跳过行数
df
2 Lee 32 HongKong 3000
0 3 Steven 43 Bay Area 8300
1 4 Ram 38 Hyderabad 3900

Sparse Data

ts = pd.Series(np.random.randn(10))
ts[2:-2] = np.nan
sts = ts.to_sparse()
sts  # Block locations 非空从0开始 或者从8开始, 长度都是2
0    0.514531
1   -1.216632
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         NaN
8    0.130844
9   -0.723086
dtype: float64
BlockIndex
Block locations: array([0, 8])
Block lengths: array([2, 2])
sts.density #  4 / 10 = 0.4  密度? 非空数据的占比
0.4
sts.to_dense()
0    0.514531
1   -1.216632
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         NaN
8    0.130844
9   -0.723086
dtype: float64
sts.sp_values
array([ 0.51453087, -1.21663236,  0.13084413, -0.72308557])
sts.sp_index
BlockIndex
Block locations: array([0, 8])
Block lengths: array([2, 2])

sparse dtypes:

np.nan 是 float64, 0是int64, False 是bool型

.any(), .all(), .item(), .bool()

if pd.Series([False, True, False]).any():
   print("I am any")
I am any
if pd.Series([False, True, False]).all():
   print("I am any")
else:
    print("I am all")
I am all
pd.Series([False]).bool()
False
help(pd.Series([False]).bool)
Help on method bool in module pandas.core.generic:

bool() method of pandas.core.series.Series instance
    Return the bool of a single element PandasObject.
    
    This must be a boolean scalar value, either True or False.  Raise a
    ValueError if the PandasObject does not have exactly 1 element, or that
    element is not boolean
原文地址:https://www.cnblogs.com/MTandHJ/p/11632526.html