In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Object Creation

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[2]:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
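
The integer inputs were upcast to float64 because np.nan is a float and NumPy integer arrays cannot hold missing values. A minimal sketch of the alternative, assuming pandas >= 0.24 with the nullable 'Int64' extension dtype (not run in this session):

s_int = pd.Series([1, 3, 5, None, 6, 8], dtype='Int64')  # keeps integer dtype; missing shows as <NA> in pandas >= 1.0
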
In [3]:
dates = pd.date_range('20180930', periods=6)
dates
Out[3]:
DatetimeIndex(['2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03',
               '2018-10-04', '2018-10-05'],
              dtype='datetime64[ns]', freq='D')
In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[4]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
In [5]:
df2 = pd.DataFrame({ 'A' : 1.,
                    'B' : pd.Timestamp('20180930'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo' })
df2
Out[5]:
A B C D E F
0 1.0 2018-09-30 1.0 3 test foo
1 1.0 2018-09-30 1.0 3 train foo
2 1.0 2018-09-30 1.0 3 test foo
3 1.0 2018-09-30 1.0 3 train foo
In [6]:
df2.dtypes
Out[6]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
In [7]:
# df2.<TAB>  (press Tab after the dot to see column names and attributes)

Viewing Data

In [8]:
df.head()
Out[8]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
In [9]:
df.tail(3)
Out[9]:
A B C D
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
In [10]:
df.index
Out[10]:
DatetimeIndex(['2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03',
               '2018-10-04', '2018-10-05'],
              dtype='datetime64[ns]', freq='D')
In [11]:
df.columns
Out[11]:
Index(['A', 'B', 'C', 'D'], dtype='object')
In [12]:
df.values
Out[12]:
array([[ 0.23668901,  0.04931661,  1.21910064,  2.58325361],
       [-0.53544914, -0.63736727, -1.36804416,  1.03517569],
       [-0.98917112,  0.65953558,  0.63783813,  0.17692478],
       [ 0.93513023, -0.44671139, -0.68193729,  1.81464572],
       [ 0.36875252,  0.27239755, -0.93495857,  1.03189194],
       [ 0.68370779, -1.86677263,  1.15141131, -0.86736677]])
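
Note that newer pandas (>= 0.24) recommends DataFrame.to_numpy() over .values, since it is explicit about the conversion; a sketch:

arr = df.to_numpy()  # same ndarray content as df.values for this all-float frame
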
In [13]:
df.describe()
Out[13]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.116610 -0.328267 0.003902 0.962421
std 0.737225 0.889460 1.133969 1.210672
min -0.989171 -1.866773 -1.368044 -0.867367
25% -0.342415 -0.589703 -0.871703 0.390667
50% 0.302721 -0.198697 -0.022050 1.033534
75% 0.604969 0.216627 1.023018 1.619778
max 0.935130 0.659536 1.219101 2.583254
In [14]:
df.T
Out[14]:
2018-09-30 00:00:00 2018-10-01 00:00:00 2018-10-02 00:00:00 2018-10-03 00:00:00 2018-10-04 00:00:00 2018-10-05 00:00:00
A 0.236689 -0.535449 -0.989171 0.935130 0.368753 0.683708
B 0.049317 -0.637367 0.659536 -0.446711 0.272398 -1.866773
C 1.219101 -1.368044 0.637838 -0.681937 -0.934959 1.151411
D 2.583254 1.035176 0.176925 1.814646 1.031892 -0.867367
In [15]:
df.sort_index(axis=1, ascending=False)
Out[15]:
D C B A
2018-09-30 2.583254 1.219101 0.049317 0.236689
2018-10-01 1.035176 -1.368044 -0.637367 -0.535449
2018-10-02 0.176925 0.637838 0.659536 -0.989171
2018-10-03 1.814646 -0.681937 -0.446711 0.935130
2018-10-04 1.031892 -0.934959 0.272398 0.368753
2018-10-05 -0.867367 1.151411 -1.866773 0.683708
In [16]:
df.sort_values(by='B')
Out[16]:
A B C D
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-02 -0.989171 0.659536 0.637838 0.176925
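
sort_values also takes multiple keys with per-key directions; a sketch, not run here:

df.sort_values(by=['B', 'A'], ascending=[False, True])  # B descending, ties broken by A ascending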

Selection

In [17]:
df['A']  # equivalent attribute access: df.A
Out[17]:
2018-09-30    0.236689
2018-10-01   -0.535449
2018-10-02   -0.989171
2018-10-03    0.935130
2018-10-04    0.368753
2018-10-05    0.683708
Freq: D, Name: A, dtype: float64
In [18]:
df[0:3]
Out[18]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
In [19]:
df['20181001':'20181002']
Out[19]:
A B C D
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
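
Note the endpoints: positional slicing excludes the stop position, while label slicing includes both labels. A sketch contrasting the two (both return the same rows here):

df[1:3]                    # positions 1 and 2; position 3 excluded
df['20181001':'20181002']  # both endpoint dates included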

Selection by Label

In [20]:
df.head()
Out[20]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
In [21]:
df.loc[dates[0]]
Out[21]:
A    0.236689
B    0.049317
C    1.219101
D    2.583254
Name: 2018-09-30 00:00:00, dtype: float64
In [22]:
df.loc[:,['A','B']]
Out[22]:
A B
2018-09-30 0.236689 0.049317
2018-10-01 -0.535449 -0.637367
2018-10-02 -0.989171 0.659536
2018-10-03 0.935130 -0.446711
2018-10-04 0.368753 0.272398
2018-10-05 0.683708 -1.866773
In [23]:
df.loc['20181001':'20181002',['A','B']]
Out[23]:
A B
2018-10-01 -0.535449 -0.637367
2018-10-02 -0.989171 0.659536
In [24]:
df.loc['20181002',['A','B']]
Out[24]:
A   -0.989171
B    0.659536
Name: 2018-10-02 00:00:00, dtype: float64
In [25]:
df.loc[dates[0],'A']
Out[25]:
0.2366890083832509
In [26]:
df.at[dates[0],'A']
Out[26]:
0.2366890083832509

Selection by Position

In [27]:
df.iloc[3]
Out[27]:
A    0.935130
B   -0.446711
C   -0.681937
D    1.814646
Name: 2018-10-03 00:00:00, dtype: float64
In [28]:
df.iloc[3:5,0:2]
Out[28]:
A B
2018-10-03 0.935130 -0.446711
2018-10-04 0.368753 0.272398
In [29]:
df.iloc[[1,2,4],[0,2]]
Out[29]:
A C
2018-10-01 -0.535449 -1.368044
2018-10-02 -0.989171 0.637838
2018-10-04 0.368753 -0.934959
In [30]:
df.iloc[1:3,:]
Out[30]:
A B C D
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176
2018-10-02 -0.989171 0.659536 0.637838 0.176925
In [31]:
df.iloc[:,1:3]
Out[31]:
B C
2018-09-30 0.049317 1.219101
2018-10-01 -0.637367 -1.368044
2018-10-02 0.659536 0.637838
2018-10-03 -0.446711 -0.681937
2018-10-04 0.272398 -0.934959
2018-10-05 -1.866773 1.151411
In [32]:
df.iloc[1,1]
Out[32]:
-0.6373672705477403
In [33]:
df.iat[1,1]
Out[33]:
-0.6373672705477403

Boolean Indexing

In [34]:
df[df.A > 0]
Out[34]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-03 0.935130 -0.446711 -0.681937 1.814646
2018-10-04 0.368753 0.272398 -0.934959 1.031892
2018-10-05 0.683708 -1.866773 1.151411 -0.867367
In [35]:
df.A > 0
Out[35]:
2018-09-30     True
2018-10-01    False
2018-10-02    False
2018-10-03     True
2018-10-04     True
2018-10-05     True
Freq: D, Name: A, dtype: bool
In [36]:
df[df > 0]
Out[36]:
A B C D
2018-09-30 0.236689 0.049317 1.219101 2.583254
2018-10-01 NaN NaN NaN 1.035176
2018-10-02 NaN 0.659536 0.637838 0.176925
2018-10-03 0.935130 NaN NaN 1.814646
2018-10-04 0.368753 0.272398 NaN 1.031892
2018-10-05 0.683708 NaN 1.151411 NaN
In [37]:
df > 0
Out[37]:
A B C D
2018-09-30 True True True True
2018-10-01 False False False True
2018-10-02 False True True True
2018-10-03 True False False True
2018-10-04 True True False True
2018-10-05 True False True False
In [38]:
df2 = df.copy()
In [39]:
df2['E'] = ['one', 'one','two','three','four','three']
In [40]:
df2
Out[40]:
A B C D E
2018-09-30 0.236689 0.049317 1.219101 2.583254 one
2018-10-01 -0.535449 -0.637367 -1.368044 1.035176 one
2018-10-02 -0.989171 0.659536 0.637838 0.176925 two
2018-10-03 0.935130 -0.446711 -0.681937 1.814646 three
2018-10-04 0.368753 0.272398 -0.934959 1.031892 four
2018-10-05 0.683708 -1.866773 1.151411 -0.867367 three
In [41]:
df2[df2['E'].isin(['two','four'])]
Out[41]:
A B C D E
2018-10-02 -0.989171 0.659536 0.637838 0.176925 two
2018-10-04 0.368753 0.272398 -0.934959 1.031892 four
In [42]:
df2['E'].isin(['two','four'])
Out[42]:
2018-09-30    False
2018-10-01    False
2018-10-02     True
2018-10-03    False
2018-10-04     True
2018-10-05    False
Freq: D, Name: E, dtype: bool
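
Conditions combine with & (and), | (or) and ~ (not); each clause needs its own parentheses because of operator precedence. A sketch:

df[(df['A'] > 0) & (df['D'] > 1)]  # rows where both conditions hold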

Setting

In [43]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20180930', periods=6))
s1
Out[43]:
2018-09-30    1
2018-10-01    2
2018-10-02    3
2018-10-03    4
2018-10-04    5
2018-10-05    6
Freq: D, dtype: int64
In [44]:
df['F'] = s1
In [45]:
df.at[dates[0],'A'] = 0
In [46]:
df.iat[0,1] = 0
In [47]:
df.loc[:,'D'] = np.array([5] * len(df))
In [48]:
df
Out[48]:
A B C D F
2018-09-30 0.000000 0.000000 1.219101 5 1
2018-10-01 -0.535449 -0.637367 -1.368044 5 2
2018-10-02 -0.989171 0.659536 0.637838 5 3
2018-10-03 0.935130 -0.446711 -0.681937 5 4
2018-10-04 0.368753 0.272398 -0.934959 5 5
2018-10-05 0.683708 -1.866773 1.151411 5 6
In [49]:
df2 = df.copy()
In [50]:
df2[df2 > 0] = -df2
In [51]:
df2
Out[51]:
A B C D F
2018-09-30 0.000000 0.000000 -1.219101 -5 -1
2018-10-01 -0.535449 -0.637367 -1.368044 -5 -2
2018-10-02 -0.989171 -0.659536 -0.637838 -5 -3
2018-10-03 -0.935130 -0.446711 -0.681937 -5 -4
2018-10-04 -0.368753 -0.272398 -0.934959 -5 -5
2018-10-05 -0.683708 -1.866773 -1.151411 -5 -6
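
When assigning into a subset, prefer a single .loc call over chained indexing, which can silently write to a temporary copy (pandas raises SettingWithCopyWarning for this). A sketch:

df.loc[df['A'] > 0, 'C'] = 0  # one indexing operation, guaranteed to hit df
# df[df['A'] > 0]['C'] = 0    # chained indexing; may only modify a copy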

Missing Data

In [52]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
In [53]:
df1.loc[dates[0]:dates[1],'E'] = 1
In [54]:
df1
Out[54]:
A B C D F E
2018-09-30 0.000000 0.000000 1.219101 5 1 1.0
2018-10-01 -0.535449 -0.637367 -1.368044 5 2 1.0
2018-10-02 -0.989171 0.659536 0.637838 5 3 NaN
2018-10-03 0.935130 -0.446711 -0.681937 5 4 NaN
In [55]:
df1.dropna(how='any')
Out[55]:
A B C D F E
2018-09-30 0.000000 0.000000 1.219101 5 1 1.0
2018-10-01 -0.535449 -0.637367 -1.368044 5 2 1.0
In [56]:
df1.fillna(value=5)
Out[56]:
A B C D F E
2018-09-30 0.000000 0.000000 1.219101 5 1 1.0
2018-10-01 -0.535449 -0.637367 -1.368044 5 2 1.0
2018-10-02 -0.989171 0.659536 0.637838 5 3 5.0
2018-10-03 0.935130 -0.446711 -0.681937 5 4 5.0
In [57]:
pd.isna(df1)
Out[57]:
A B C D F E
2018-09-30 False False False False False False
2018-10-01 False False False False False False
2018-10-02 False False False False False True
2018-10-03 False False False False False True
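
Missing values can also be filled from neighbouring rows instead of a constant; a sketch using the df1 above:

df1.ffill()  # forward-fill: 'E' on 2018-10-02/03 takes the last seen value, 1.0
df1.bfill()  # backward-fill; a no-op for 'E' here since no later value exists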

Operations

In [58]:
df.mean()
Out[58]:
A    0.077162
B   -0.336486
C    0.003902
D    5.000000
F    3.500000
dtype: float64
In [59]:
df.mean(axis=1)
Out[59]:
2018-09-30    1.443820
2018-10-01    0.891828
2018-10-02    1.661641
2018-10-03    1.761296
2018-10-04    1.941238
2018-10-05    2.193669
Freq: D, dtype: float64
In [60]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
Out[60]:
2018-09-30    NaN
2018-10-01    NaN
2018-10-02    1.0
2018-10-03    3.0
2018-10-04    5.0
2018-10-05    NaN
Freq: D, dtype: float64
In [61]:
df.sub(s, axis='index')
Out[61]:
A B C D F
2018-09-30 NaN NaN NaN NaN NaN
2018-10-01 NaN NaN NaN NaN NaN
2018-10-02 -1.989171 -0.340464 -0.362162 4.0 2.0
2018-10-03 -2.064870 -3.446711 -3.681937 2.0 1.0
2018-10-04 -4.631247 -4.727602 -5.934959 0.0 0.0
2018-10-05 NaN NaN NaN NaN NaN

Apply

In [62]:
df.apply(np.cumsum)
Out[62]:
A B C D F
2018-09-30 0.000000 0.000000 1.219101 5 1
2018-10-01 -0.535449 -0.637367 -0.148944 10 3
2018-10-02 -1.524620 0.022168 0.488895 15 6
2018-10-03 -0.589490 -0.424543 -0.193043 20 10
2018-10-04 -0.220738 -0.152146 -1.128001 25 15
2018-10-05 0.462970 -2.018918 0.023410 30 21
In [63]:
df.apply(lambda x: x.max() - x.min())
Out[63]:
A    1.924301
B    2.526308
C    2.587145
D    0.000000
F    5.000000
dtype: float64
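
apply also works row-wise with axis=1, passing each row in as a Series; a sketch on the current df:

df.apply(lambda row: row['C'] + row['F'], axis=1)  # per-row sum of two columns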

Histogramming

In [64]:
s = pd.Series(np.random.randint(0, 7, size=10))
s
Out[64]:
0    3
1    5
2    4
3    3
4    4
5    0
6    6
7    2
8    3
9    0
dtype: int64
In [65]:
s.value_counts()
Out[65]:
3    3
4    2
0    2
6    1
5    1
2    1
dtype: int64
In [66]:
plt.hist(s)
Out[66]:
(array([2., 0., 0., 1., 0., 3., 2., 0., 1., 1.]),
 array([0. , 0.6, 1.2, 1.8, 2.4, 3. , 3.6, 4.2, 4.8, 5.4, 6. ]),
 <a list of 10 Patch objects>)
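
value_counts sorts by frequency; pass normalize=True for relative frequencies. A sketch for the draw above:

s.value_counts(normalize=True)  # e.g. the value 3 maps to 0.3 (3 of 10 draws)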

String Methods

In [67]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
Out[67]:
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object
In [68]:
s.str.lower()
Out[68]:
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object
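
The .str accessor covers most Python string methods plus regex helpers, and it propagates NaN rather than raising. A sketch:

s.str.contains('a', na=False)  # boolean mask; na=False maps the NaN entry to False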

Merge

Concat

In [69]:
df = pd.DataFrame(np.random.randn(10, 4))
df
Out[69]:
0 1 2 3
0 0.087077 0.619386 0.867665 -0.837620
1 0.815566 2.409637 -0.448387 1.075085
2 -0.409700 0.803823 0.363961 0.385565
3 0.530237 -0.958590 -0.667308 0.748824
4 -0.286006 -1.389019 -0.291030 -0.607074
5 0.041992 0.991613 -1.635254 -0.572434
6 2.495111 1.536428 0.349171 0.843119
7 -0.230787 0.209379 -1.429215 -0.420250
8 1.165919 -0.946511 -0.001935 -1.430094
9 -0.120528 0.210132 0.660869 -1.447023
In [70]:
pieces = [df[:3], df[3:7], df[7:]]
In [71]:
pd.concat(pieces)
Out[71]:
0 1 2 3
0 0.087077 0.619386 0.867665 -0.837620
1 0.815566 2.409637 -0.448387 1.075085
2 -0.409700 0.803823 0.363961 0.385565
3 0.530237 -0.958590 -0.667308 0.748824
4 -0.286006 -1.389019 -0.291030 -0.607074
5 0.041992 0.991613 -1.635254 -0.572434
6 2.495111 1.536428 0.349171 0.843119
7 -0.230787 0.209379 -1.429215 -0.420250
8 1.165919 -0.946511 -0.001935 -1.430094
9 -0.120528 0.210132 0.660869 -1.447023
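
concat keeps the original row labels by default; ignore_index=True renumbers, and keys=... tags each piece in a MultiIndex. A sketch:

pd.concat(pieces, keys=['p0', 'p1', 'p2'])  # index becomes (piece label, original row)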

Join

In [72]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
In [73]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In [74]:
left
Out[74]:
key lval
0 foo 1
1 foo 2
In [75]:
right
Out[75]:
key rval
0 foo 4
1 foo 5
In [76]:
pd.merge(left, right, on='key')
Out[76]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
In [77]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
In [78]:
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
In [79]:
left
Out[79]:
key lval
0 foo 1
1 bar 2
In [80]:
right
Out[80]:
key rval
0 foo 4
1 bar 5
In [81]:
pd.merge(left, right, on='key')
Out[81]:
key lval rval
0 foo 1 4
1 bar 2 5
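
merge performs an inner join by default; the how parameter selects left/right/outer behaviour when the keys do not fully overlap. A sketch:

pd.merge(left, right, on='key', how='outer')  # keep unmatched keys from either side, filling NaN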

Append

In [82]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
Out[82]:
A B C D
0 0.519225 -0.668912 0.003412 -0.412123
1 -0.154657 0.380200 0.743048 1.448302
2 2.064954 -0.047551 1.478642 -1.340999
3 -0.916340 0.097538 0.666997 1.545696
4 -0.173463 0.625218 0.882467 0.518243
5 0.659977 1.190710 0.402814 0.694023
6 -2.467640 1.653766 1.247549 0.001997
7 0.843699 1.138764 0.964801 1.405764
In [83]:
s = df.iloc[3]
In [84]:
df.append(s, ignore_index=True)
Out[84]:
A B C D
0 0.519225 -0.668912 0.003412 -0.412123
1 -0.154657 0.380200 0.743048 1.448302
2 2.064954 -0.047551 1.478642 -1.340999
3 -0.916340 0.097538 0.666997 1.545696
4 -0.173463 0.625218 0.882467 0.518243
5 0.659977 1.190710 0.402814 0.694023
6 -2.467640 1.653766 1.247549 0.001997
7 0.843699 1.138764 0.964801 1.405764
8 -0.916340 0.097538 0.666997 1.545696
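
DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; the concat equivalent of the cell above, as a sketch:

pd.concat([df, s.to_frame().T], ignore_index=True)  # s.to_frame().T turns the row Series into a 1-row frame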

Grouping

In [85]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
In [86]:
df
Out[86]:
A B C D
0 foo one 0.575231 1.468494
1 bar one 1.698981 -1.181062
2 foo two 0.194902 -1.180333
3 bar three -0.134763 0.864738
4 foo two -0.305532 0.116030
5 bar two -0.322637 0.711455
6 foo one -1.142639 0.266006
7 foo three -0.630636 -0.740960
In [87]:
df.groupby('A').sum()
Out[87]:
C D
A
bar 1.241581 0.395131
foo -1.308673 -0.070764
In [88]:
df.groupby(['A','B']).sum()
Out[88]:
C D
A B
bar one 1.698981 -1.181062
three -0.134763 0.864738
two -0.322637 0.711455
foo one -0.567408 1.734499
three -0.630636 -0.740960
two -0.110629 -1.064303
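
groupby also supports per-column reductions via agg; a sketch:

df.groupby('A').agg({'C': 'sum', 'D': 'mean'})  # a different aggregation per column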

Reshaping

In [89]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
In [90]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
In [91]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
In [92]:
df2 = df[:4]
In [93]:
df2
Out[93]:
A B
first second
bar one -0.561033 0.640060
two 0.667047 -1.773522
baz one -0.790835 0.819752
two 0.700842 -1.086048
In [94]:
stacked = df2.stack()
In [95]:
stacked
Out[95]:
first  second   
bar    one     A   -0.561033
               B    0.640060
       two     A    0.667047
               B   -1.773522
baz    one     A   -0.790835
               B    0.819752
       two     A    0.700842
               B   -1.086048
dtype: float64
In [96]:
stacked.unstack()
Out[96]:
A B
first second
bar one -0.561033 0.640060
two 0.667047 -1.773522
baz one -0.790835 0.819752
two 0.700842 -1.086048
In [97]:
stacked.unstack(1)
Out[97]:
second one two
first
bar A -0.561033 0.667047
B 0.640060 -1.773522
baz A -0.790835 0.700842
B 0.819752 -1.086048
In [98]:
stacked.unstack(0)
Out[98]:
first bar baz
second
one A -0.561033 -0.790835
B 0.640060 0.819752
two A 0.667047 0.700842
B -1.773522 -1.086048
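
unstack pivots the innermost level by default; levels can also be addressed by name instead of position. A sketch:

stacked.unstack('second')  # same as stacked.unstack(1) above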

Pivot Tables

In [99]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
In [100]:
df
Out[100]:
A B C D E
0 one A foo -2.108428 -0.969701
1 one B foo 0.870943 1.043163
2 two C foo 0.853561 0.218730
3 three A bar 0.791335 0.520778
4 one B bar 0.027449 0.716511
5 one C bar 0.054091 0.457888
6 two A foo -1.475761 -0.552329
7 three B foo -2.189474 0.418718
8 one C foo -0.797176 0.188688
9 one A bar 0.738079 -0.439365
10 two B bar 0.457438 0.411471
11 three C bar -0.865658 -0.398760
In [101]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[101]:
C bar foo
A B
one A 0.738079 -2.108428
B 0.027449 0.870943
C 0.054091 -0.797176
three A 0.791335 NaN
B NaN -2.189474
C -0.865658 NaN
two A NaN -1.475761
B 0.457438 NaN
C NaN 0.853561
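
pivot_table aggregates with the mean by default; aggfunc changes the reduction and fill_value replaces the NaN holes seen above. A sketch:

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],
               aggfunc='sum', fill_value=0)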

Time Series

In [102]:
rng = pd.date_range('2018-09-30', periods=100, freq='S')
rng
Out[102]:
DatetimeIndex(['2018-09-30 00:00:00', '2018-09-30 00:00:01',
               '2018-09-30 00:00:02', '2018-09-30 00:00:03',
               '2018-09-30 00:00:04', '2018-09-30 00:00:05',
               '2018-09-30 00:00:06', '2018-09-30 00:00:07',
               '2018-09-30 00:00:08', '2018-09-30 00:00:09',
               '2018-09-30 00:00:10', '2018-09-30 00:00:11',
               '2018-09-30 00:00:12', '2018-09-30 00:00:13',
               '2018-09-30 00:00:14', '2018-09-30 00:00:15',
               '2018-09-30 00:00:16', '2018-09-30 00:00:17',
               '2018-09-30 00:00:18', '2018-09-30 00:00:19',
               '2018-09-30 00:00:20', '2018-09-30 00:00:21',
               '2018-09-30 00:00:22', '2018-09-30 00:00:23',
               '2018-09-30 00:00:24', '2018-09-30 00:00:25',
               '2018-09-30 00:00:26', '2018-09-30 00:00:27',
               '2018-09-30 00:00:28', '2018-09-30 00:00:29',
               '2018-09-30 00:00:30', '2018-09-30 00:00:31',
               '2018-09-30 00:00:32', '2018-09-30 00:00:33',
               '2018-09-30 00:00:34', '2018-09-30 00:00:35',
               '2018-09-30 00:00:36', '2018-09-30 00:00:37',
               '2018-09-30 00:00:38', '2018-09-30 00:00:39',
               '2018-09-30 00:00:40', '2018-09-30 00:00:41',
               '2018-09-30 00:00:42', '2018-09-30 00:00:43',
               '2018-09-30 00:00:44', '2018-09-30 00:00:45',
               '2018-09-30 00:00:46', '2018-09-30 00:00:47',
               '2018-09-30 00:00:48', '2018-09-30 00:00:49',
               '2018-09-30 00:00:50', '2018-09-30 00:00:51',
               '2018-09-30 00:00:52', '2018-09-30 00:00:53',
               '2018-09-30 00:00:54', '2018-09-30 00:00:55',
               '2018-09-30 00:00:56', '2018-09-30 00:00:57',
               '2018-09-30 00:00:58', '2018-09-30 00:00:59',
               '2018-09-30 00:01:00', '2018-09-30 00:01:01',
               '2018-09-30 00:01:02', '2018-09-30 00:01:03',
               '2018-09-30 00:01:04', '2018-09-30 00:01:05',
               '2018-09-30 00:01:06', '2018-09-30 00:01:07',
               '2018-09-30 00:01:08', '2018-09-30 00:01:09',
               '2018-09-30 00:01:10', '2018-09-30 00:01:11',
               '2018-09-30 00:01:12', '2018-09-30 00:01:13',
               '2018-09-30 00:01:14', '2018-09-30 00:01:15',
               '2018-09-30 00:01:16', '2018-09-30 00:01:17',
               '2018-09-30 00:01:18', '2018-09-30 00:01:19',
               '2018-09-30 00:01:20', '2018-09-30 00:01:21',
               '2018-09-30 00:01:22', '2018-09-30 00:01:23',
               '2018-09-30 00:01:24', '2018-09-30 00:01:25',
               '2018-09-30 00:01:26', '2018-09-30 00:01:27',
               '2018-09-30 00:01:28', '2018-09-30 00:01:29',
               '2018-09-30 00:01:30', '2018-09-30 00:01:31',
               '2018-09-30 00:01:32', '2018-09-30 00:01:33',
               '2018-09-30 00:01:34', '2018-09-30 00:01:35',
               '2018-09-30 00:01:36', '2018-09-30 00:01:37',
               '2018-09-30 00:01:38', '2018-09-30 00:01:39'],
              dtype='datetime64[ns]', freq='S')
In [103]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head()
Out[103]:
2018-09-30 00:00:00    337
2018-09-30 00:00:01    306
2018-09-30 00:00:02     48
2018-09-30 00:00:03     70
2018-09-30 00:00:04    338
Freq: S, dtype: int64
In [104]:
ts.resample('5Min').sum()
Out[104]:
2018-09-30    25733
Freq: 5T, dtype: int64
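
resample needs a datetime-like index and acts as a time-based groupby: pick a bin width, then a reduction. A sketch on the same series:

ts.resample('1Min').mean()  # two bins for this 100-second series: 00:00 and 00:01
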
In [105]:
rng = pd.date_range('2018-09-30 00:00', periods=5, freq='D')
rng
Out[105]:
DatetimeIndex(['2018-09-30', '2018-10-01', '2018-10-02', '2018-10-03',
               '2018-10-04'],
              dtype='datetime64[ns]', freq='D')
In [106]:
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
Out[106]:
2018-09-30   -0.089196
2018-10-01    0.836780
2018-10-02   -0.535983
2018-10-03    1.488694
2018-10-04    0.628257
Freq: D, dtype: float64
In [107]:
ts_utc = ts.tz_localize('UTC')
ts_utc
Out[107]:
2018-09-30 00:00:00+00:00   -0.089196
2018-10-01 00:00:00+00:00    0.836780
2018-10-02 00:00:00+00:00   -0.535983
2018-10-03 00:00:00+00:00    1.488694
2018-10-04 00:00:00+00:00    0.628257
Freq: D, dtype: float64
In [108]:
ts_utc.tz_convert('US/Eastern')
Out[108]:
2018-09-29 20:00:00-04:00   -0.089196
2018-09-30 20:00:00-04:00    0.836780
2018-10-01 20:00:00-04:00   -0.535983
2018-10-02 20:00:00-04:00    1.488694
2018-10-03 20:00:00-04:00    0.628257
Freq: D, dtype: float64
In [109]:
rng = pd.date_range('2018-09-30', periods=5, freq='M')
In [110]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
In [111]:
ts
Out[111]:
2018-09-30   -0.193251
2018-10-31    2.234970
2018-11-30   -0.470974
2018-12-31   -0.368833
2019-01-31   -0.768082
Freq: M, dtype: float64
In [112]:
ps = ts.to_period()
ps
Out[112]:
2018-09   -0.193251
2018-10    2.234970
2018-11   -0.470974
2018-12   -0.368833
2019-01   -0.768082
Freq: M, dtype: float64
In [113]:
ps.to_timestamp()
Out[113]:
2018-09-01   -0.193251
2018-10-01    2.234970
2018-11-01   -0.470974
2018-12-01   -0.368833
2019-01-01   -0.768082
Freq: MS, dtype: float64
In [114]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng
Out[114]:
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='period[Q-NOV]', freq='Q-NOV')
In [115]:
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.head()
Out[115]:
1990Q1    0.622639
1990Q2   -0.308812
1990Q3    1.132039
1990Q4   -0.143473
1991Q1    1.394870
Freq: Q-NOV, dtype: float64
In [116]:
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
Out[116]:
1990-03-01 09:00    0.622639
1990-06-01 09:00   -0.308812
1990-09-01 09:00    1.132039
1990-12-01 09:00   -0.143473
1991-03-01 09:00    1.394870
Freq: H, dtype: float64

Categoricals

In [117]:
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
In [118]:
df["grade"] = df["raw_grade"].astype("category")
In [119]:
df["grade"]
Out[119]:
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
In [120]:
df["grade"].cat.categories = ["very good", "good", "very bad"]
df['grade']
Out[120]:
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
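
Newer pandas deprecates assigning to .cat.categories directly (removed in 2.0); the supported spelling of the rename above, as a sketch:

df['grade'] = df['grade'].cat.rename_categories(['very good', 'good', 'very bad'])
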
In [121]:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df.grade
Out[121]:
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
In [122]:
df.sort_values(by="grade")
Out[122]:
id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good
In [123]:
df.groupby("grade").size()
Out[123]:
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64
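
The empty categories (bad, medium) appear because groupby defaults to observed=False for categoricals; pass observed=True to drop them. A sketch:

df.groupby('grade', observed=True).size()  # only very bad, good, very good remain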

Plotting

In [124]:
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
In [125]:
ts = ts.cumsum()
In [126]:
ts.plot()
Out[126]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f48c8ee1e80>
In [127]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
In [128]:
df = df.cumsum()
In [129]:
df.plot()
Out[129]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f48c8e75780>
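
plot returns a matplotlib Axes, so the usual matplotlib customization applies afterwards; a sketch:

ax = df.plot(figsize=(10, 4), title='Cumulative random walks')
ax.set_ylabel('value')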

CSV

In [130]:
df.to_csv('foo.csv')
In [131]:
pd.read_csv('foo.csv').head()
Out[131]:
Unnamed: 0 A B C D
0 2000-01-01 -0.243603 0.063758 -0.762494 -1.522712
1 2000-01-02 -0.634656 -1.204408 -2.555926 -2.590963
2 2000-01-03 -0.111139 -1.358780 -2.827295 -2.115130
3 2000-01-04 1.218165 -2.095825 -2.950879 -2.788545
4 2000-01-05 3.211926 -2.986790 -2.338792 -2.790208
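
The round trip above turned the index into an unnamed string column; telling read_csv which column is the index and to parse dates restores the DatetimeIndex. A sketch:

pd.read_csv('foo.csv', index_col=0, parse_dates=True).head()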

Excel

In [132]:
df.to_excel('foo.xlsx', sheet_name='Sheet1')
In [133]:
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()  # requires an Excel engine: xlrd for older pandas, openpyxl for newer
Out[133]:
A B C D
2000-01-01 -0.243603 0.063758 -0.762494 -1.522712
2000-01-02 -0.634656 -1.204408 -2.555926 -2.590963
2000-01-03 -0.111139 -1.358780 -2.827295 -2.115130
2000-01-04 1.218165 -2.095825 -2.950879 -2.788545
2000-01-05 3.211926 -2.986790 -2.338792 -2.790208
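
To write several frames into one workbook, ExcelWriter works as a context manager; a sketch, assuming an Excel engine such as openpyxl is installed:

with pd.ExcelWriter('foo.xlsx') as writer:  # engine inferred from the extension
    df.to_excel(writer, sheet_name='Sheet1')
    df.head().to_excel(writer, sheet_name='Sheet2')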