In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Object Creation

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[2]:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
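
The integer inputs were upcast to float64 because np.nan is a float and NumPy integer arrays cannot hold missing values. A minimal sketch of the alternative, assuming pandas >= 0.24 with the nullable 'Int64' extension dtype (not run in this session):

s_int = pd.Series([1, 3, 5, None, 6, 8], dtype='Int64')  # keeps integer dtype; missing shows as <NA> in pandas >= 1.0
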
In [3]:
dates = pd.date_range('20180930', periods=6)
dates
Out[3]:
DatetimeIndex(['2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03',
               '2018-10-04', '2018-10-05'],
              dtype='datetime64[ns]', freq='D')
In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[4]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
In [5]:
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20180930'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
df2
Out[5]:
A B C D E F
0 1.0 2018-09-30 1.0 3 test foo
1 1.0 2018-09-30 1.0 3 train foo
2 1.0 2018-09-30 1.0 3 test foo
3 1.0 2018-09-30 1.0 3 train foo
In [6]:
df2.dtypes
Out[6]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
In [7]:
# df2.<TAB>  (press Tab after the dot to see column names and attributes)

Viewing Data

In [8]:
df.head()
Out[8]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
In [9]:
df.tail(3)
Out[9]:
A B C D
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
In [10]:
df.index
Out[10]:
DatetimeIndex(['2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03',
               '2018-10-04', '2018-10-05'],
              dtype='datetime64[ns]', freq='D')
In [11]:
df.columns
Out[11]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [12]:
df.values
Out[12]:
array([[ 0.23668901,  0.04931661,  1.21910064,  2.58325361],
       [-0.53544914, -0.63736727, -1.36804416,  1.03517569],
       [-0.98917112,  0.65953558,  0.63783813,  0.17692478],
       [ 0.93513023, -0.44671139, -0.68193729,  1.81464572],
       [ 0.36875252,  0.27239755, -0.93495857,  1.03189194],
       [ 0.68370779, -1.86677263,  1.15141131, -0.86736677]])
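
Note that newer pandas (>= 0.24) recommends DataFrame.to_numpy() over .values, since it is explicit about the conversion; a sketch:

arr = df.to_numpy()  # same ndarray content as df.values for this all-float frame
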
In [13]:
df.describe()
Out[13]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.116610 -0.328267 0.003902 0.962421
std 0.737225 0.889460 1.133969 1.210672
min -0.989171 -1.866773 -1.368044 -0.867367
25% -0.342415 -0.589703 -0.871703 0.390667
50% 0.302721 -0.198697 -0.022050 1.033534
75% 0.604969 0.216627 1.023018 1.619778
max 0.935130 0.659536 1.219101 2.583254
In [14]:
df.T
Out[14]:
2018-09-30 00:00:00 2018-10-01 00:00:00 2018-10-02 00:00:00 2018-10-03 00:00:00 2018-10-04 00:00:00 2018-10-05 00:00:00
A 0.236689 -0.535449 -0.989171 0.935130 0.368753 0.683708
B 0.049317 -0.637367 0.659536 -0.446711 0.272398 -1.866773
C 1.219101 -1.368044 0.637838 -0.681937 -0.934959 1.151411
D 2.583254 1.035176 0.176925 1.814646 1.031892 -0.867367
In [15]:
df.sort_index(axis=1, ascending=False)
Out[15]:
D C B A
2018-09-30 2.583254 1.219101 0.049317 0.236689
2018-10-01 1.035176 -1.368044 -0.637367 -0.535449
2018-10-02 0.176925 0.637838 0.659536 -0.989171
2018-10-03 1.814646 -0.681937 -0.446711 0.935130
2018-10-04 1.031892 -0.934959 0.272398 0.368753
2018-10-05 -0.867367 1.151411 -1.866773 0.683708
In [16]:
df.sort_values(by='B')
Out[16]:
A B C D
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-02 -0.989171 0.659536 0.637838 0.176925
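
sort_values also takes multiple keys with per-key directions; a sketch, not run here:

df.sort_values(by=['B', 'A'], ascending=[False, True])  # B descending, ties broken by A ascending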

Selection

In [17]:
df['A']  # equivalent attribute access: df.A
Out[17]:
2018-09-30    0.236689
2018-10-01   -0.535449
2018-10-02   -0.989171
2018-10-03    0.935130
2018-10-04    0.368753
2018-10-05    0.683708
Freq: D, Name: A, dtype: float64
In [18]:
df[0:3]
Out[18]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
In [19]:
df['20181001':'20181002']
Out[19]:
A B C D
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
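
Note the endpoints: positional slicing excludes the stop position, while label slicing includes both labels. A sketch contrasting the two (both return the same rows here):

df[1:3]                    # positions 1 and 2; position 3 excluded
df['20181001':'20181002']  # both endpoint dates included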

Selection by Label

In [20]:
df.head()
Out[20]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
In [21]:
df.loc[dates[0]]
Out[21]:
A    0.236689
B    0.049317
C    1.219101
D    2.583254
Name: 2018-09-30 00:00:00, dtype: float64
In [22]:
df.loc[:,['A','B']]
Out[22]:
A B
2018-09-30 0.236689 0.049317
2018-10-01 -0.535449 -0.637367
2018-10-02 -0.989171 0.659536
2018-10-03 0.935130 -0.446711
2018-10-04 0.368753 0.272398
2018-10-05 0.683708 -1.866773
In [23]:
df.loc['20181001':'20181002',['A','B']]
Out[23]:
A B
2018-10-01 -0.535449 -0.637367
2018-10-02 -0.989171 0.659536
In [24]:
df.loc['20181002',['A','B']]
Out[24]:
A   -0.989171
B    0.659536
Name: 2018-10-02 00:00:00, dtype: float64
In [25]:
df.loc[dates[0],'A']
Out[25]:
0.2366890083832509
In [26]:
df.at[dates[0],'A']
Out[26]:
0.2366890083832509

Selection by Position

In [27]:
df.iloc[3]
Out[27]:
A    0.935130
B   -0.446711
C   -0.681937
D    1.814646
Name: 2018-10-03 00:00:00, dtype: float64
In [28]:
df.iloc[3:5,0:2]
Out[28]:
A B
2018-10-03 0.935130 -0.446711
2018-10-04 0.368753 0.272398
In [29]:
df.iloc[[1,2,4],[0,2]]
Out[29]:
A C
2018-10-01 -0.535449 -1.368044
2018-10-02 -0.989171 0.637838
2018-10-04 0.368753 -0.934959
In [30]:
df.iloc[1:3,:]
Out[30]:
A B C D
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
In [31]:
df.iloc[:,1:3]
Out[31]:
B C
2018-09-30 0.049317 1.219101
2018-10-01 -0.637367 -1.368044
2018-10-02 0.659536 0.637838
2018-10-03 -0.446711 -0.681937
2018-10-04 0.272398 -0.934959
2018-10-05 -1.866773 1.151411
In [32]:
df.iloc[1,1]
Out[32]:
-0.6373672705477403
In [33]:
df.iat[1,1]
Out[33]:
-0.6373672705477403

Boolean Indexing

In [34]:
df[df.A > 0]
Out[34]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
In [35]:
df.A > 0
Out[35]:
2018-09-30     True
2018-10-01    False
2018-10-02    False
2018-10-03     True
2018-10-04     True
2018-10-05     True
Freq: D, Name: A, dtype: bool
In [36]:
df[df > 0]
Out[36]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 NaN NaN NaN 1.035176
2018-10-02 NaN 0.659536 0.637838 0.176925
2018-10-03 0.935130 NaN NaN 1.814646
2018-10-04 0.368753 0.272398 NaN 1.031892
2018-10-05 0.683708 NaN 1.151411 NaN
In [37]:
df > 0
Out[37]:
A B C D
2018-09-30 True True True True
2018-10-01 False False False True
2018-10-02 False True True True
2018-10-03 True False False True
2018-10-04 True True False True
2018-10-05 True False True False
In [38]:
df2 = df.copy()
In [39]:
df2['E'] = ['one', 'one','two','three','four','three']
In [40]:
df2
Out[40]:
A B C D E
2018-09-30 0.236689 0.049317 1.219101 2.583254 one
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176 one
2018-10-02 -0.989171 0.659536 0.637838 0.176925 two
2018-10-03 0.935130 -0.446711 -0.681937 1.814646 three
2018-10-04 0.368753 0.272398 -0.934959 1.031892 four
2018-10-05 0.683708 -1.866773 1.151411 -0.867367 three
In [41]:
df2[df2['E'].isin(['two','four'])]
Out[41]:
A B C D E
2018-10-02 -0.989171 0.659536 0.637838 0.176925 two
2018-10-04 0.368753 0.272398 -0.934959 1.031892 four
In [42]:
df2['E'].isin(['two','four'])
Out[42]:
2018-09-30    False
2018-10-01    False
2018-10-02     True
2018-10-03    False
2018-10-04     True
2018-10-05    False
Freq: D, Name: E, dtype: bool
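
Conditions combine with & (and), | (or) and ~ (not); each clause needs its own parentheses because of operator precedence. A sketch:

df[(df['A'] > 0) & (df['D'] > 1)]  # rows where both conditions hold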

Setting

In [43]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20180930', periods=6))
s1
Out[43]:
2018-09-30    1
2018-10-01    2
2018-10-02    3
2018-10-03    4
2018-10-04    5
2018-10-05    6
Freq: D, dtype: int64
In [44]:
df['F'] = s1
In [45]:
df.at[dates[0],'A'] = 0
In [46]:
df.iat[0,1] = 0
In [47]:
df.loc[:,'D'] = np.array([5] * len(df))
In [48]:
df
Out[48]:
A B C D F
2018-09-30 0.000000 0.000000 1.219101 5 1
2018-10-01 -0.535449 -0.637367 -1.368044 5 2
2018-10-02 -0.989171 0.659536 0.637838 5 3
2018-10-03 0.935130 -0.446711 -0.681937 5 4
2018-10-04 0.368753 0.272398 -0.934959 5 5
2018-10-05 0.683708 -1.866773 1.151411 5 6
In [49]:
df2 = df.copy()
In [50]:
df2[df2 > 0] = -df2
In [51]:
df2
Out[51]:
A B C D F
2018-09-30 0.000000 0.000000 -1.219101 -5 -1
2018-10-01 -0.535449 -0.637367 -1.368044 -5 -2
2018-10-02 -0.989171 -0.659536 -0.637838 -5 -3
2018-10-03 -0.935130 -0.446711 -0.681937 -5 -4
2018-10-04 -0.368753 -0.272398 -0.934959 -5 -5
2018-10-05 -0.683708 -1.866773 -1.151411 -5 -6
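
When assigning into a subset, prefer a single .loc call over chained indexing, which can silently write to a temporary copy (pandas raises SettingWithCopyWarning for this). A sketch:

df.loc[df['A'] > 0, 'C'] = 0  # one indexing operation, guaranteed to hit df
# df[df['A'] > 0]['C'] = 0    # chained indexing; may only modify a copy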

Missing Data

In [52]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
In [53]:
df1.loc[dates[0]:dates[1],'E'] = 1
In [54]:
df1
Out[54]:
A B C D F E
2018-09-30 0.000000 0.000000 1.219101 5 1 1.0
2018-10-01 -0.535449 -0.637367 -1.368044 5 2 1.0
2018-10-02 -0.989171 0.659536 0.637838 5 3 NaN
2018-10-03 0.935130 -0.446711 -0.681937 5 4 NaN
In [55]:
df1.dropna(how='any')
Out[55]:
A B C D F E
2018-09-30 0.000000 0.000000 1.219101 5 1 1.0
2018-10-01 -0.535449 -0.637367 -1.368044 5 2 1.0
In [56]:
df1.fillna(value=5)
Out[56]:
A B C D F E
2018-09-30 0.000000 0.000000 1.219101 5 1 1.0
2018-10-01 -0.535449 -0.637367 -1.368044 5 2 1.0
2018-10-02 -0.989171 0.659536 0.637838 5 3 5.0
2018-10-03 0.935130 -0.446711 -0.681937 5 4 5.0
In [57]:
pd.isna(df1)
Out[57]:
A B C D F E
2018-09-30 False False False False False False
2018-10-01 False False False False False False
2018-10-02 False False False False False True
2018-10-03 False False False False False True
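
Missing values can also be filled from neighbouring rows instead of a constant; a sketch using the df1 above:

df1.ffill()  # forward-fill: 'E' on 2018-10-02/03 takes the last seen value, 1.0
df1.bfill()  # backward-fill; a no-op for 'E' here since no later value exists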

Operations

In [58]:
df.mean()
Out[58]:
A    0.077162
B   -0.336486
C    0.003902
D    5.000000
F    3.500000
dtype: float64
In [59]:
df.mean(axis=1)
Out[59]:
2018-09-30    1.443820
2018-10-01    0.891828
2018-10-02    1.661641
2018-10-03    1.761296
2018-10-04    1.941238
2018-10-05    2.193669
Freq: D, dtype: float64
In [60]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
Out[60]:
2018-09-30    NaN
2018-10-01    NaN
2018-10-02    1.0
2018-10-03    3.0
2018-10-04    5.0
2018-10-05    NaN
Freq: D, dtype: float64
In [61]:
df.sub(s, axis='index')
Out[61]:
A B C D F
2018-09-30 NaN NaN NaN NaN NaN
2018-10-01 NaN NaN NaN NaN NaN
2018-10-02 -1.989171 -0.340464 -0.362162 4.0 2.0
2018-10-03 -2.064870 -3.446711 -3.681937 2.0 1.0
2018-10-04 -4.631247 -4.727602 -5.934959 0.0 0.0
2018-10-05 NaN NaN NaN NaN NaN

Apply

In [62]:
df.apply(np.cumsum)
Out[62]:
A B C D F
2018-09-30 0.000000 0.000000 1.219101 5 1
2018-10-01 -0.535449 -0.637367 -0.148944 10 3
2018-10-02 -1.524620 0.022168 0.488895 15 6
2018-10-03 -0.589490 -0.424543 -0.193043 20 10
2018-10-04 -0.220738 -0.152146 -1.128001 25 15
2018-10-05 0.462970 -2.018918 0.023410 30 21
In [63]:
df.apply(lambda x: x.max() - x.min())
Out[63]:
A    1.924301
B    2.526308
C    2.587145
D    0.000000
F    5.000000
dtype: float64
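
apply also works row-wise with axis=1, passing each row in as a Series; a sketch on the current df:

df.apply(lambda row: row['C'] + row['F'], axis=1)  # per-row sum of two columns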

Histogramming

In [64]:
s = pd.Series(np.random.randint(0, 7, size=10))
s
Out[64]:
0    3
1    5
2    4
3    3
4    4
5    0
6    6
7    2
8    3
9    0
dtype: int64
In [65]:
s.value_counts()
Out[65]:
3    3
4    2
0    2
6    1
5    1
2    1
dtype: int64
In [66]:
plt.hist(s)
Out[66]:
(array([2., 0., 0., 1., 0., 3., 2., 0., 1., 1.]),
 array([0. , 0.6, 1.2, 1.8, 2.4, 3. , 3.6, 4.2, 4.8, 5.4, 6. ]),
 <a list of 10 Patch objects>)
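
value_counts sorts by frequency; pass normalize=True for relative frequencies. A sketch for the draw above:

s.value_counts(normalize=True)  # e.g. the value 3 maps to 0.3 (3 of 10 draws)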

String Methods

In [67]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
Out[67]:
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object
In [68]:
s.str.lower()
Out[68]:
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
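
The .str accessor covers most Python string methods plus regex helpers, and it propagates NaN rather than raising. A sketch:

s.str.contains('a', na=False)  # boolean mask; na=False maps the NaN entry to False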

Merge

Concat

In [69]:
df = pd.DataFrame(np.random.randn(10, 4))
df
Out[69]:
0 1 2 3
0 0.087077 0.619386 0.867665 -0.837620
1 0.815566 2.409637 -0.448387 1.075085
2 -0.409700 0.803823 0.363961 0.385565
3 0.530237 -0.958590 -0.667308 0.748824
4 -0.286006 -1.389019 -0.291030 -0.607074
5 0.041992 0.991613 -1.635254 -0.572434
6 2.495111 1.536428 0.349171 0.843119
7 -0.230787 0.209379 -1.429215 -0.420250
8 1.165919 -0.946511 -0.001935 -1.430094
9 -0.120528 0.210132 0.660869 -1.447023
In [70]:
pieces = [df[:3], df[3:7], df[7:]]
In [71]:
pd.concat(pieces)
Out[71]:
0 1 2 3
0 0.087077 0.619386 0.867665 -0.837620
1 0.815566 2.409637 -0.448387 1.075085
2 -0.409700 0.803823 0.363961 0.385565
3 0.530237 -0.958590 -0.667308 0.748824
4 -0.286006 -1.389019 -0.291030 -0.607074
5 0.041992 0.991613 -1.635254 -0.572434
6 2.495111 1.536428 0.349171 0.843119
7 -0.230787 0.209379 -1.429215 -0.420250
8 1.165919 -0.946511 -0.001935 -1.430094
9 -0.120528 0.210132 0.660869 -1.447023
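
concat keeps the original row labels by default; ignore_index=True renumbers, and keys=... tags each piece in a MultiIndex. A sketch:

pd.concat(pieces, keys=['p0', 'p1', 'p2'])  # index becomes (piece label, original row)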

Join

In [72]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
In [73]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In [74]:
left
Out[74]:
key lval
0 foo 1
1 foo 2
In [75]:
right
Out[75]:
key rval
0 foo 4
1 foo 5
In [76]:
pd.merge(left, right, on='key')
Out[76]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
In [77]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
In [78]:
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
In [79]:
left
Out[79]:
key lval
0 foo 1
1 bar 2
In [80]:
right
Out[80]:
key rval
0 foo 4
1 bar 5
In [81]:
pd.merge(left, right, on='key')
Out[81]:
key lval rval
0 foo 1 4
1 bar 2 5
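
merge performs an inner join by default; the how parameter selects left/right/outer behaviour when the keys do not fully overlap. A sketch:

pd.merge(left, right, on='key', how='outer')  # keep unmatched keys from either side, filling NaN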

Append

In [82]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
Out[82]:
A B C D
0 0.519225 -0.668912 0.003412 -0.412123
1 -0.154657 0.380200 0.743048 1.448302
2 2.064954 -0.047551 1.478642 -1.340999
3 -0.916340 0.097538 0.666997 1.545696
4 -0.173463 0.625218 0.882467 0.518243
5 0.659977 1.190710 0.402814 0.694023
6 -2.467640 1.653766 1.247549 0.001997
7 0.843699 1.138764 0.964801 1.405764
In [83]:
s = df.iloc[3]
In [84]:
df.append(s, ignore_index=True)
Out[84]:
A B C D
0 0.519225 -0.668912 0.003412 -0.412123
1 -0.154657 0.380200 0.743048 1.448302
2 2.064954 -0.047551 1.478642 -1.340999
3 -0.916340 0.097538 0.666997 1.545696
4 -0.173463 0.625218 0.882467 0.518243
5 0.659977 1.190710 0.402814 0.694023
6 -2.467640 1.653766 1.247549 0.001997
7 0.843699 1.138764 0.964801 1.405764
8 -0.916340 0.097538 0.666997 1.545696
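
DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; the concat equivalent of the cell above, as a sketch:

pd.concat([df, s.to_frame().T], ignore_index=True)  # s.to_frame().T turns the row Series into a 1-row frame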

Grouping

In [85]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
In [86]:
df
Out[86]:
A B C D
0 foo one 0.575231 1.468494
1 bar one 1.698981 -1.181062
2 foo two 0.194902 -1.180333
3 bar three -0.134763 0.864738
4 foo two -0.305532 0.116030
5 bar two -0.322637 0.711455
6 foo one -1.142639 0.266006
7 foo three -0.630636 -0.740960
In [87]:
df.groupby('A').sum()
Out[87]:
C D
A
bar 1.241581 0.395131
foo -1.308673 -0.070764
In [88]:
df.groupby(['A','B']).sum()
Out[88]:
C D
A B
bar one 1.698981 -1.181062
three -0.134763 0.864738
two -0.322637 0.711455
foo one -0.567408 1.734499
three -0.630636 -0.740960
two -0.110629 -1.064303
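
groupby also supports per-column reductions via agg; a sketch:

df.groupby('A').agg({'C': 'sum', 'D': 'mean'})  # a different aggregation per column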

Reshaping

In [89]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
In [90]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
In [91]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
In [92]:
df2 = df[:4]
In [93]:
df2
Out[93]:
A B
first second
bar one -0.561033 0.640060
two 0.667047 -1.773522
baz one -0.790835 0.819752
two 0.700842 -1.086048
In [94]:
stacked = df2.stack()
In [95]:
stacked
Out[95]:
first  second   
bar    one     A   -0.561033
               B    0.640060
       two     A    0.667047
               B   -1.773522
baz    one     A   -0.790835
               B    0.819752
       two     A    0.700842
               B   -1.086048
dtype: float64
In [96]:
stacked.unstack()
Out[96]:
A B
first second
bar one -0.561033 0.640060
two 0.667047 -1.773522
baz one -0.790835 0.819752
two 0.700842 -1.086048
In [97]:
stacked.unstack(1)
Out[97]:
second one two
first
bar A -0.561033 0.667047
B 0.640060 -1.773522
baz A -0.790835 0.700842
B 0.819752 -1.086048
In [98]:
stacked.unstack(0)
Out[98]:
first bar baz
second
one A -0.561033 -0.790835
B 0.640060 0.819752
two A 0.667047 0.700842
B -1.773522 -1.086048
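
unstack pivots the innermost level by default; levels can also be addressed by name instead of position. A sketch:

stacked.unstack('second')  # same as stacked.unstack(1) above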

Pivot Tables

In [99]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
In [100]:
df
Out[100]:
A B C D E
0 one A foo -2.108428 -0.969701
1 one B foo 0.870943 1.043163
2 two C foo 0.853561 0.218730
3 three A bar 0.791335 0.520778
4 one B bar 0.027449 0.716511
5 one C bar 0.054091 0.457888
6 two A foo -1.475761 -0.552329
7 three B foo -2.189474 0.418718
8 one C foo -0.797176 0.188688
9 one A bar 0.738079 -0.439365
10 two B bar 0.457438 0.411471
11 three C bar -0.865658 -0.398760
In [101]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[101]:
C bar foo
A B
one A 0.738079 -2.108428
B 0.027449 0.870943
C 0.054091 -0.797176
three A 0.791335 NaN
B NaN -2.189474
C -0.865658 NaN
two A NaN -1.475761
B 0.457438 NaN
C NaN 0.853561
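
pivot_table aggregates with the mean by default; aggfunc changes the reduction and fill_value replaces the NaN holes seen above. A sketch:

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],
               aggfunc='sum', fill_value=0)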

Time Series

In [102]:
rng = pd.date_range('2018-09-30', periods=100, freq='S')
rng
Out[102]:
DatetimeIndex(['2018-09-30 00:00:00', '2018-09-30 00:00:01',
               '2018-09-30 00:00:02', '2018-09-30 00:00:03',
               '2018-09-30 00:00:04', '2018-09-30 00:00:05',
               '2018-09-30 00:00:06', '2018-09-30 00:00:07',
               '2018-09-30 00:00:08', '2018-09-30 00:00:09',
               '2018-09-30 00:00:10', '2018-09-30 00:00:11',
               '2018-09-30 00:00:12', '2018-09-30 00:00:13',
               '2018-09-30 00:00:14', '2018-09-30 00:00:15',
               '2018-09-30 00:00:16', '2018-09-30 00:00:17',
               '2018-09-30 00:00:18', '2018-09-30 00:00:19',
               '2018-09-30 00:00:20', '2018-09-30 00:00:21',
               '2018-09-30 00:00:22', '2018-09-30 00:00:23',
               '2018-09-30 00:00:24', '2018-09-30 00:00:25',
               '2018-09-30 00:00:26', '2018-09-30 00:00:27',
               '2018-09-30 00:00:28', '2018-09-30 00:00:29',
               '2018-09-30 00:00:30', '2018-09-30 00:00:31',
               '2018-09-30 00:00:32', '2018-09-30 00:00:33',
               '2018-09-30 00:00:34', '2018-09-30 00:00:35',
               '2018-09-30 00:00:36', '2018-09-30 00:00:37',
               '2018-09-30 00:00:38', '2018-09-30 00:00:39',
               '2018-09-30 00:00:40', '2018-09-30 00:00:41',
               '2018-09-30 00:00:42', '2018-09-30 00:00:43',
               '2018-09-30 00:00:44', '2018-09-30 00:00:45',
               '2018-09-30 00:00:46', '2018-09-30 00:00:47',
               '2018-09-30 00:00:48', '2018-09-30 00:00:49',
               '2018-09-30 00:00:50', '2018-09-30 00:00:51',
               '2018-09-30 00:00:52', '2018-09-30 00:00:53',
               '2018-09-30 00:00:54', '2018-09-30 00:00:55',
               '2018-09-30 00:00:56', '2018-09-30 00:00:57',
               '2018-09-30 00:00:58', '2018-09-30 00:00:59',
               '2018-09-30 00:01:00', '2018-09-30 00:01:01',
               '2018-09-30 00:01:02', '2018-09-30 00:01:03',
               '2018-09-30 00:01:04', '2018-09-30 00:01:05',
               '2018-09-30 00:01:06', '2018-09-30 00:01:07',
               '2018-09-30 00:01:08', '2018-09-30 00:01:09',
               '2018-09-30 00:01:10', '2018-09-30 00:01:11',
               '2018-09-30 00:01:12', '2018-09-30 00:01:13',
               '2018-09-30 00:01:14', '2018-09-30 00:01:15',
               '2018-09-30 00:01:16', '2018-09-30 00:01:17',
               '2018-09-30 00:01:18', '2018-09-30 00:01:19',
               '2018-09-30 00:01:20', '2018-09-30 00:01:21',
               '2018-09-30 00:01:22', '2018-09-30 00:01:23',
               '2018-09-30 00:01:24', '2018-09-30 00:01:25',
               '2018-09-30 00:01:26', '2018-09-30 00:01:27',
               '2018-09-30 00:01:28', '2018-09-30 00:01:29',
               '2018-09-30 00:01:30', '2018-09-30 00:01:31',
               '2018-09-30 00:01:32', '2018-09-30 00:01:33',
               '2018-09-30 00:01:34', '2018-09-30 00:01:35',
               '2018-09-30 00:01:36', '2018-09-30 00:01:37',
               '2018-09-30 00:01:38', '2018-09-30 00:01:39'],
              dtype='datetime64[ns]', freq='S')
In [103]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()
Out[103]:
2018-09-30 00:00:00    337
2018-09-30 00:00:01    306
2018-09-30 00:00:02     48
2018-09-30 00:00:03     70
2018-09-30 00:00:04    338
Freq: S, dtype: int64
In [104]:
ts.resample('5Min').sum()
Out[104]:
2018-09-30    25733
Freq: 5T, dtype: int64
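
resample needs a datetime-like index and acts as a time-based groupby: pick a bin width, then a reduction. A sketch on the same series:

ts.resample('1Min').mean()  # two bins for this 100-second series: 00:00 and 00:01
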
In [105]:
rng = pd.date_range('2018-09-30 00:00', periods=5, freq='D')
rng
Out[105]:
DatetimeIndex(['2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03',
               '2018-10-04'],
              dtype='datetime64[ns]', freq='D')
In [106]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
Out[106]:
2018-09-30   -0.089196
2018-10-01    0.836780
2018-10-02   -0.535983
2018-10-03    1.488694
2018-10-04    0.628257
Freq: D, dtype: float64
In [107]:
ts_utc = ts.tz_localize('UTC')
ts_utc
Out[107]:
2018-09-30 00:00:00+00:00   -0.089196
2018-10-01 00:00:00+00:00    0.836780
2018-10-02 00:00:00+00:00   -0.535983
2018-10-03 00:00:00+00:00    1.488694
2018-10-04 00:00:00+00:00    0.628257
Freq: D, dtype: float64
In [108]:
ts_utc.tz_convert('US/Eastern')
Out[108]:
2018-09-29 20:00:00-04:00   -0.089196
2018-09-30 20:00:00-04:00    0.836780
2018-10-01 20:00:00-04:00   -0.535983
2018-10-02 20:00:00-04:00    1.488694
2018-10-03 20:00:00-04:00    0.628257
Freq: D, dtype: float64
In [109]:
rng = pd.date_range('2018-09-30', periods=5, freq='M')
In [110]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
In [111]:
ts
Out[111]:
2018-09-30   -0.193251
2018-10-31    2.234970
2018-11-30   -0.470974
2018-12-31   -0.368833
2019-01-31   -0.768082
Freq: M, dtype: float64
In [112]:
ps = ts.to_period()
ps
Out[112]:
2018-09   -0.193251
2018-10    2.234970
2018-11   -0.470974
2018-12   -0.368833
2019-01   -0.768082
Freq: M, dtype: float64
In [113]:
ps.to_timestamp()
Out[113]:
2018-09-01   -0.193251
2018-10-01    2.234970
2018-11-01   -0.470974
2018-12-01   -0.368833
2019-01-01   -0.768082
Freq: MS, dtype: float64
In [114]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng
Out[114]:
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='period[Q-NOV]', freq='Q-NOV')
In [115]:
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.head()
Out[115]:
1990Q1    0.622639
1990Q2   -0.308812
1990Q3    1.132039
1990Q4   -0.143473
1991Q1    1.394870
Freq: Q-NOV, dtype: float64
In [116]:
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
Out[116]:
1990-03-01 09:00    0.622639
1990-06-01 09:00   -0.308812
1990-09-01 09:00    1.132039
1990-12-01 09:00   -0.143473
1991-03-01 09:00    1.394870
Freq: H, dtype: float64

Categoricals

In [117]:
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
In [118]:
df["grade"] = df["raw_grade"].astype("category")
In [119]:
df["grade"]
Out[119]:
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
In [120]:
df["grade"].cat.categories = ["very good", "good", "very bad"]
df['grade']
Out[120]:
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
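
Newer pandas deprecates assigning to .cat.categories directly (removed in 2.0); the supported spelling of the rename above, as a sketch:

df['grade'] = df['grade'].cat.rename_categories(['very good', 'good', 'very bad'])
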
In [121]:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df.grade
Out[121]:
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
In [122]:
df.sort_values(by="grade")
Out[122]:
id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good
In [123]:
df.groupby("grade").size()
Out[123]:
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64
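
The empty categories (bad, medium) appear because groupby defaults to observed=False for categoricals; pass observed=True to drop them. A sketch:

df.groupby('grade', observed=True).size()  # only very bad, good, very good remain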

Plotting

In [124]:
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
In [125]:
ts = ts.cumsum()
In [126]:
ts.plot()
Out[126]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f48c8ee1e80>
In [127]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
In [128]:
df = df.cumsum()
In [129]:
df.plot()
Out[129]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f48c8e75780>
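
plot returns a matplotlib Axes, so the usual matplotlib customization applies afterwards; a sketch:

ax = df.plot(figsize=(10, 4), title='Cumulative random walks')
ax.set_ylabel('value')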

CSV

In [130]:
df.to_csv('foo.csv')
In [131]:
pd.read_csv('foo.csv').head()
Out[131]:
Unnamed: 0 A B C D
0 2000-01-01 -0.243603 0.063758 -0.762494 -1.522712
1 2000-01-02 -0.634656 -1.204408 -2.555926 -2.590963
2 2000-01-03 -0.111139 -1.358780 -2.827295 -2.115130
3 2000-01-04 1.218165 -2.095825 -2.950879 -2.788545
4 2000-01-05 3.211926 -2.986790 -2.338792 -2.790208
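
The round trip above turned the index into an unnamed string column; telling read_csv which column is the index and to parse dates restores the DatetimeIndex. A sketch:

pd.read_csv('foo.csv', index_col=0, parse_dates=True).head()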

Excel

In [132]:
df.to_excel('foo.xlsx', sheet_name='Sheet1')
In [133]:
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()  # requires an Excel engine: xlrd for older pandas, openpyxl for newer
Out[133]:
A B C D
2000-01-01 -0.243603 0.063758 -0.762494 -1.522712
2000-01-02 -0.634656 -1.204408 -2.555926 -2.590963
2000-01-03 -0.111139 -1.358780 -2.827295 -2.115130
2000-01-04 1.218165 -2.095825 -2.950879 -2.788545
2000-01-05 3.211926 -2.986790 -2.338792 -2.790208
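
To write several frames into one workbook, ExcelWriter works as a context manager; a sketch, assuming an Excel engine such as openpyxl is installed:

with pd.ExcelWriter('foo.xlsx') as writer:  # engine inferred from the extension
    df.to_excel(writer, sheet_name='Sheet1')
    df.head().to_excel(writer, sheet_name='Sheet2')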