process xlsx with pandas

import pandas as pd
import numpy as np
!ls sales*.xlsx
sales-feb-2014.xlsx
sales-jan-2014.xlsx
sales-mar-2014.xlsx
import glob
glob.glob('sales*.xlsx')
['sales-feb-2014.xlsx', 'sales-jan-2014.xlsx', 'sales-mar-2014.xlsx']
all_data = pd.DataFrame()
for f in glob.glob('sales*.xlsx'):
    df = pd.read_excel(f)
    all_data = all_data.append(df, ignore_index=True)
all_data.head()
account number name sku quantity unit price ext price date
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59
1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46
2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32
3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48
4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20
all_data['date'] = pd.to_datetime(all_data['date'])
all_data.head()
account number name sku quantity unit price ext price date
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59
1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46
2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32
3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48
4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20
status = pd.read_excel('customer-status.xlsx')
status
account number name status
0 740150 Barton LLC gold
1 714466 Trantow-Barrows silver
2 218895 Kulas Inc bronze
3 307599 Kassulke, Ondricka and Metz bronze
4 412290 Jerde-Hilpert bronze
5 729833 Koepp Ltd silver
6 146832 Kiehn-Spinka silver
7 688981 Keeling LLC silver
8 786968 Frami, Hills and Schmidt silver
9 239344 Stokes LLC gold
10 672390 Kuhn-Gusikowski silver
11 141962 Herman LLC gold
12 424914 White-Trantow silver
13 527099 Sanford and Sons bronze
14 642753 Pollich LLC bronze
15 257198 Cronin, Oberbrunner and Spencer gold
all_data_st = pd.merge(all_data, status, how='left')
all_data_st.head()
account number name sku quantity unit price ext price date status
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46 bronze
2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32 bronze
3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48 bronze
4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20 silver
all_data_st[all_data_st['account number']==737550]
account number name sku quantity unit price ext price date status
15 737550 Fritsch, Russel and Anderson S1-47412 40 51.01 2040.40 2014-02-05 01:20:40 NaN
25 737550 Fritsch, Russel and Anderson S1-06532 34 18.69 635.46 2014-02-07 09:22:02 NaN
66 737550 Fritsch, Russel and Anderson S1-27722 15 70.23 1053.45 2014-02-16 18:24:42 NaN
78 737550 Fritsch, Russel and Anderson S2-34077 26 93.35 2427.10 2014-02-20 18:45:43 NaN
80 737550 Fritsch, Russel and Anderson S1-93683 31 10.52 326.12 2014-02-21 13:55:45 NaN
91 737550 Fritsch, Russel and Anderson S2-82423 46 67.67 3112.82 2014-02-24 08:35:31 NaN
117 737550 Fritsch, Russel and Anderson S2-82423 14 81.92 1146.88 2014-01-03 19:07:37 NaN
122 737550 Fritsch, Russel and Anderson B1-53102 23 71.56 1645.88 2014-01-04 08:57:48 NaN
134 737550 Fritsch, Russel and Anderson B1-53636 42 42.06 1766.52 2014-01-08 00:02:11 NaN
140 737550 Fritsch, Russel and Anderson S1-27722 20 29.54 590.80 2014-01-09 13:20:40 NaN
150 737550 Fritsch, Russel and Anderson S1-93683 22 71.68 1576.96 2014-01-11 23:47:36 NaN
171 737550 Fritsch, Russel and Anderson S2-77896 41 38.43 1575.63 2014-01-16 22:39:11 NaN
177 737550 Fritsch, Russel and Anderson S2-82423 13 24.98 324.74 2014-01-18 06:32:00 NaN
209 737550 Fritsch, Russel and Anderson B1-53102 34 99.48 3382.32 2014-01-22 22:00:08 NaN
223 737550 Fritsch, Russel and Anderson B1-65551 49 19.14 937.86 2014-01-27 01:59:54 NaN
232 737550 Fritsch, Russel and Anderson S1-65481 47 45.71 2148.37 2014-01-28 15:49:45 NaN
238 737550 Fritsch, Russel and Anderson S1-50961 1 34.81 34.81 2014-01-30 23:56:20 NaN
245 737550 Fritsch, Russel and Anderson B1-50809 20 50.11 1002.20 2014-03-01 23:47:17 NaN
258 737550 Fritsch, Russel and Anderson S2-83881 12 63.60 763.20 2014-03-04 15:26:20 NaN
336 737550 Fritsch, Russel and Anderson S2-11481 47 49.22 2313.34 2014-03-22 23:44:18 NaN
339 737550 Fritsch, Russel and Anderson S1-93683 22 18.19 400.18 2014-03-23 10:33:33 NaN
379 737550 Fritsch, Russel and Anderson B1-65551 12 56.24 674.88 2014-03-31 08:43:24 NaN
all_data_st['status'].fillna('bronze')
all_data_st.head()
account number name sku quantity unit price ext price date status
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46 bronze
2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32 bronze
3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48 bronze
4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20 silver
all_data_st['status'] = all_data_st['status'].astype('category')
all_data_st.head()
account number name sku quantity unit price ext price date status
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
1 412290 Jerde-Hilpert S1-27722 11 21.12 232.32 2014-02-01 11:51:46 bronze
2 412290 Jerde-Hilpert B1-86481 3 35.99 107.97 2014-02-01 17:24:32 bronze
3 412290 Jerde-Hilpert B1-20000 23 78.90 1814.70 2014-02-01 19:56:48 bronze
4 672390 Kuhn-Gusikowski S1-06532 48 55.82 2679.36 2014-02-02 03:45:20 silver
all_data_st.dtypes
account number             int64
name                      object
sku                       object
quantity                   int64
unit price               float64
ext price                float64
date              datetime64[ns]
status                  category
dtype: object
all_data_st.sort_values('status').head()
account number name sku quantity unit price ext price date status
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
258 737550 Fritsch, Russel and Anderson S2-83881 12 63.60 763.20 2014-03-04 15:26:20 NaN
261 383080 Will LLC S1-93683 28 90.86 2544.08 2014-03-05 05:11:49 NaN
66 737550 Fritsch, Russel and Anderson S1-27722 15 70.23 1053.45 2014-02-16 18:24:42 NaN
201 383080 Will LLC B1-53636 37 14.40 532.80 2014-01-20 22:18:21 NaN
all_data_st['status'].cat.set_categories(['gold', 'silver', 'bronze'], inplace=True)
all_data_st.sort_values('status').head()
account number name sku quantity unit price ext price date status
0 383080 Will LLC B1-20000 7 33.69 235.83 2014-02-01 09:04:59 NaN
134 737550 Fritsch, Russel and Anderson B1-53636 42 42.06 1766.52 2014-01-08 00:02:11 NaN
289 383080 Will LLC S1-82801 3 77.06 231.18 2014-03-11 16:38:10 NaN
122 737550 Fritsch, Russel and Anderson B1-53102 23 71.56 1645.88 2014-01-04 08:57:48 NaN
299 604255 Halvorson, Crona and Champlin S1-47412 27 25.69 693.63 2014-03-14 16:01:34 NaN
all_data_st['status'].describe()
count        307
unique         3
top       silver
freq         140
Name: status, dtype: object
all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].mean()
quantity unit price ext price
status
gold 24.375000 53.723889 1351.944583
silver 22.842857 57.272714 1320.032214
bronze 24.284211 59.649684 1452.321368
all_data_st.groupby(['status'])['quantity', 'unit price', 'ext price'].agg([np.sum, np.mean, np.std])
quantity unit price ext price
sum mean std sum mean std sum mean std
status
gold 1755 24.375000 14.575145 3868.12 53.723889 28.740080 97340.01 1351.944583 1182.657312
silver 3198 22.842857 14.512843 8018.18 57.272714 26.556242 184804.51 1320.032214 1086.384051
bronze 2307 24.284211 14.812777 5666.72 59.649684 26.530155 137970.53 1452.321368 1140.105645
all_data_st.drop_duplicates(subset=['account number', 'name']).ix[:,[0, 1, 7]].groupby(['status'])['name'].count()
status
gold      4
silver    7
bronze    5
Name: name, dtype: int64
posted @ 2017-04-21 01:24  idlewith  阅读(271)  评论(0编辑  收藏  举报