import pandas as pd
import numpy as np
!ls sales*.xlsx
sales-feb-2014.xlsx
sales-jan-2014.xlsx
sales-mar-2014.xlsx
import glob
glob.glob('sales*.xlsx')
['sales-feb-2014.xlsx', 'sales-jan-2014.xlsx', 'sales-mar-2014.xlsx']
all_data = pd.DataFrame()
for f in glob.glob('sales*.xlsx'):
df = pd.read_excel(f)
all_data = all_data.append(df, ignore_index=True)
all_data.head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
1 |
412290 |
Jerde-Hilpert |
S1-27722 |
11 |
21.12 |
232.32 |
2014-02-01 11:51:46 |
2 |
412290 |
Jerde-Hilpert |
B1-86481 |
3 |
35.99 |
107.97 |
2014-02-01 17:24:32 |
3 |
412290 |
Jerde-Hilpert |
B1-20000 |
23 |
78.90 |
1814.70 |
2014-02-01 19:56:48 |
4 |
672390 |
Kuhn-Gusikowski |
S1-06532 |
48 |
55.82 |
2679.36 |
2014-02-02 03:45:20 |
all_data['date'] = pd.to_datetime(all_data['date'])
all_data.head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
1 |
412290 |
Jerde-Hilpert |
S1-27722 |
11 |
21.12 |
232.32 |
2014-02-01 11:51:46 |
2 |
412290 |
Jerde-Hilpert |
B1-86481 |
3 |
35.99 |
107.97 |
2014-02-01 17:24:32 |
3 |
412290 |
Jerde-Hilpert |
B1-20000 |
23 |
78.90 |
1814.70 |
2014-02-01 19:56:48 |
4 |
672390 |
Kuhn-Gusikowski |
S1-06532 |
48 |
55.82 |
2679.36 |
2014-02-02 03:45:20 |
status = pd.read_excel('customer-status.xlsx')
status
|
account number |
name |
status |
0 |
740150 |
Barton LLC |
gold |
1 |
714466 |
Trantow-Barrows |
silver |
2 |
218895 |
Kulas Inc |
bronze |
3 |
307599 |
Kassulke, Ondricka and Metz |
bronze |
4 |
412290 |
Jerde-Hilpert |
bronze |
5 |
729833 |
Koepp Ltd |
silver |
6 |
146832 |
Kiehn-Spinka |
silver |
7 |
688981 |
Keeling LLC |
silver |
8 |
786968 |
Frami, Hills and Schmidt |
silver |
9 |
239344 |
Stokes LLC |
gold |
10 |
672390 |
Kuhn-Gusikowski |
silver |
11 |
141962 |
Herman LLC |
gold |
12 |
424914 |
White-Trantow |
silver |
13 |
527099 |
Sanford and Sons |
bronze |
14 |
642753 |
Pollich LLC |
bronze |
15 |
257198 |
Cronin, Oberbrunner and Spencer |
gold |
all_data_st = pd.merge(all_data, status, how='left')
all_data_st.head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
status |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
NaN |
1 |
412290 |
Jerde-Hilpert |
S1-27722 |
11 |
21.12 |
232.32 |
2014-02-01 11:51:46 |
bronze |
2 |
412290 |
Jerde-Hilpert |
B1-86481 |
3 |
35.99 |
107.97 |
2014-02-01 17:24:32 |
bronze |
3 |
412290 |
Jerde-Hilpert |
B1-20000 |
23 |
78.90 |
1814.70 |
2014-02-01 19:56:48 |
bronze |
4 |
672390 |
Kuhn-Gusikowski |
S1-06532 |
48 |
55.82 |
2679.36 |
2014-02-02 03:45:20 |
silver |
all_data_st[all_data_st['account number']==737550]
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
status |
15 |
737550 |
Fritsch, Russel and Anderson |
S1-47412 |
40 |
51.01 |
2040.40 |
2014-02-05 01:20:40 |
NaN |
25 |
737550 |
Fritsch, Russel and Anderson |
S1-06532 |
34 |
18.69 |
635.46 |
2014-02-07 09:22:02 |
NaN |
66 |
737550 |
Fritsch, Russel and Anderson |
S1-27722 |
15 |
70.23 |
1053.45 |
2014-02-16 18:24:42 |
NaN |
78 |
737550 |
Fritsch, Russel and Anderson |
S2-34077 |
26 |
93.35 |
2427.10 |
2014-02-20 18:45:43 |
NaN |
80 |
737550 |
Fritsch, Russel and Anderson |
S1-93683 |
31 |
10.52 |
326.12 |
2014-02-21 13:55:45 |
NaN |
91 |
737550 |
Fritsch, Russel and Anderson |
S2-82423 |
46 |
67.67 |
3112.82 |
2014-02-24 08:35:31 |
NaN |
117 |
737550 |
Fritsch, Russel and Anderson |
S2-82423 |
14 |
81.92 |
1146.88 |
2014-01-03 19:07:37 |
NaN |
122 |
737550 |
Fritsch, Russel and Anderson |
B1-53102 |
23 |
71.56 |
1645.88 |
2014-01-04 08:57:48 |
NaN |
134 |
737550 |
Fritsch, Russel and Anderson |
B1-53636 |
42 |
42.06 |
1766.52 |
2014-01-08 00:02:11 |
NaN |
140 |
737550 |
Fritsch, Russel and Anderson |
S1-27722 |
20 |
29.54 |
590.80 |
2014-01-09 13:20:40 |
NaN |
150 |
737550 |
Fritsch, Russel and Anderson |
S1-93683 |
22 |
71.68 |
1576.96 |
2014-01-11 23:47:36 |
NaN |
171 |
737550 |
Fritsch, Russel and Anderson |
S2-77896 |
41 |
38.43 |
1575.63 |
2014-01-16 22:39:11 |
NaN |
177 |
737550 |
Fritsch, Russel and Anderson |
S2-82423 |
13 |
24.98 |
324.74 |
2014-01-18 06:32:00 |
NaN |
209 |
737550 |
Fritsch, Russel and Anderson |
B1-53102 |
34 |
99.48 |
3382.32 |
2014-01-22 22:00:08 |
NaN |
223 |
737550 |
Fritsch, Russel and Anderson |
B1-65551 |
49 |
19.14 |
937.86 |
2014-01-27 01:59:54 |
NaN |
232 |
737550 |
Fritsch, Russel and Anderson |
S1-65481 |
47 |
45.71 |
2148.37 |
2014-01-28 15:49:45 |
NaN |
238 |
737550 |
Fritsch, Russel and Anderson |
S1-50961 |
1 |
34.81 |
34.81 |
2014-01-30 23:56:20 |
NaN |
245 |
737550 |
Fritsch, Russel and Anderson |
B1-50809 |
20 |
50.11 |
1002.20 |
2014-03-01 23:47:17 |
NaN |
258 |
737550 |
Fritsch, Russel and Anderson |
S2-83881 |
12 |
63.60 |
763.20 |
2014-03-04 15:26:20 |
NaN |
336 |
737550 |
Fritsch, Russel and Anderson |
S2-11481 |
47 |
49.22 |
2313.34 |
2014-03-22 23:44:18 |
NaN |
339 |
737550 |
Fritsch, Russel and Anderson |
S1-93683 |
22 |
18.19 |
400.18 |
2014-03-23 10:33:33 |
NaN |
379 |
737550 |
Fritsch, Russel and Anderson |
B1-65551 |
12 |
56.24 |
674.88 |
2014-03-31 08:43:24 |
NaN |
all_data_st['status'].fillna('bronze')
all_data_st.head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
status |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
NaN |
1 |
412290 |
Jerde-Hilpert |
S1-27722 |
11 |
21.12 |
232.32 |
2014-02-01 11:51:46 |
bronze |
2 |
412290 |
Jerde-Hilpert |
B1-86481 |
3 |
35.99 |
107.97 |
2014-02-01 17:24:32 |
bronze |
3 |
412290 |
Jerde-Hilpert |
B1-20000 |
23 |
78.90 |
1814.70 |
2014-02-01 19:56:48 |
bronze |
4 |
672390 |
Kuhn-Gusikowski |
S1-06532 |
48 |
55.82 |
2679.36 |
2014-02-02 03:45:20 |
silver |
all_data_st['status'] = all_data_st['status'].astype('category')
all_data_st.head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
status |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
NaN |
1 |
412290 |
Jerde-Hilpert |
S1-27722 |
11 |
21.12 |
232.32 |
2014-02-01 11:51:46 |
bronze |
2 |
412290 |
Jerde-Hilpert |
B1-86481 |
3 |
35.99 |
107.97 |
2014-02-01 17:24:32 |
bronze |
3 |
412290 |
Jerde-Hilpert |
B1-20000 |
23 |
78.90 |
1814.70 |
2014-02-01 19:56:48 |
bronze |
4 |
672390 |
Kuhn-Gusikowski |
S1-06532 |
48 |
55.82 |
2679.36 |
2014-02-02 03:45:20 |
silver |
all_data_st.dtypes
account number int64
name object
sku object
quantity int64
unit price float64
ext price float64
date datetime64[ns]
status category
dtype: object
all_data_st.sort_values('status').head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
status |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
NaN |
258 |
737550 |
Fritsch, Russel and Anderson |
S2-83881 |
12 |
63.60 |
763.20 |
2014-03-04 15:26:20 |
NaN |
261 |
383080 |
Will LLC |
S1-93683 |
28 |
90.86 |
2544.08 |
2014-03-05 05:11:49 |
NaN |
66 |
737550 |
Fritsch, Russel and Anderson |
S1-27722 |
15 |
70.23 |
1053.45 |
2014-02-16 18:24:42 |
NaN |
201 |
383080 |
Will LLC |
B1-53636 |
37 |
14.40 |
532.80 |
2014-01-20 22:18:21 |
NaN |
all_data_st['status'].cat.set_categories(['gold', 'silver', 'bronze'], inplace=True)
all_data_st.sort_values('status').head()
|
account number |
name |
sku |
quantity |
unit price |
ext price |
date |
status |
0 |
383080 |
Will LLC |
B1-20000 |
7 |
33.69 |
235.83 |
2014-02-01 09:04:59 |
NaN |
134 |
737550 |
Fritsch, Russel and Anderson |
B1-53636 |
42 |
42.06 |
1766.52 |
2014-01-08 00:02:11 |
NaN |
289 |
383080 |
Will LLC |
S1-82801 |
3 |
77.06 |
231.18 |
2014-03-11 16:38:10 |
NaN |
122 |
737550 |
Fritsch, Russel and Anderson |
B1-53102 |
23 |
71.56 |
1645.88 |
2014-01-04 08:57:48 |
NaN |
299 |
604255 |
Halvorson, Crona and Champlin |
S1-47412 |
27 |
25.69 |
693.63 |
2014-03-14 16:01:34 |
NaN |
all_data_st['status'].describe()
count 307
unique 3
top silver
freq 140
Name: status, dtype: object
all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].mean()
|
quantity |
unit price |
ext price |
status |
|
|
|
gold |
24.375000 |
53.723889 |
1351.944583 |
silver |
22.842857 |
57.272714 |
1320.032214 |
bronze |
24.284211 |
59.649684 |
1452.321368 |
all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].agg([np.sum, np.mean, np.std])
|
quantity |
unit price |
ext price |
|
sum |
mean |
std |
sum |
mean |
std |
sum |
mean |
std |
status |
|
|
|
|
|
|
|
|
|
gold |
1755 |
24.375000 |
14.575145 |
3868.12 |
53.723889 |
28.740080 |
97340.01 |
1351.944583 |
1182.657312 |
silver |
3198 |
22.842857 |
14.512843 |
8018.18 |
57.272714 |
26.556242 |
184804.51 |
1320.032214 |
1086.384051 |
bronze |
2307 |
24.284211 |
14.812777 |
5666.72 |
59.649684 |
26.530155 |
137970.53 |
1452.321368 |
1140.105645 |
all_data_st.drop_duplicates(subset=['account number', 'name']).ix[:,[0, 1, 7]].groupby(['status'])['name'].count()
status
gold 4
silver 7
bronze 5
Name: name, dtype: int64