import numpy as np
from pandas import Series
# Sample mapping of names to e-mail addresses; 'Wes' has no address and is
# stored as NaN (a missing value).
data = {
    'Davae': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
}
data

{'Davae': 'dave@google.com',
 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com',
 'Wes': nan}

# Build a Series from the dict: the keys become the index labels and the
# values (including the NaN) become the data.
data2 = Series(data)
data2

Davae    dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

# Boolean mask marking which entries are missing (NaN).
data2.isna()

Davae    False
Steve    False
Rob      False
Wes       True
dtype: bool

通过 data2.map，所有字符串和正则表达式方法都能（以 lambda 表达式或其他函数的形式）应用于各个值，但如果存在 NA（缺失值）就会报错。

为了解决这个问题，可以使用 Series 对象的 str 属性：它提供的字符串方法是矢量化的，并且会自动跳过 NA 值。

# Element-wise substring test through the .str accessor; the missing entry
# comes back as NaN instead of raising an error.
data2.str.contains('gmail')

Davae    False
Steve     True
Rob       True
Wes        NaN
dtype: object

# `re` supplies the IGNORECASE flag used below; it is imported here because
# nothing earlier in the script imports it.
import re

# E-mail pattern with three capture groups: user name, domain name and
# domain suffix.
regex = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
# Run findall on every element. The character classes only list upper-case
# letters, so IGNORECASE is needed to match the lower-case addresses.
res = data2.str.findall(regex, flags=re.IGNORECASE)
res

Davae    [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

# Pick element 0 of each match list (the first matched tuple) by indexing
# through the .str accessor.
res_first = res.str[0]
res_first

Davae    (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

# Element 1 of each tuple is the text captured by the second group
# (the domain name).
res_second = res_first.str[1]
res_second

Davae    google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

# Keep only the first two characters of every string.
res_second.str.slice(stop=2)

Davae     go
Steve     gm
Rob       gm
Wes      NaN
dtype: object

示例： USDA食品数据库

import json
import pandas as pd
# Load the food database. The context manager closes the file handle as soon
# as parsing is done, and the explicit UTF-8 encoding keeps the result
# independent of the machine's locale default.
with open(
    r'C:\Users\1\Desktop\Python\练习代码\基础模块面向对象网络编程\day2\food.json',
    encoding='utf-8',
) as food_file:
    db = json.load(food_file)
len(db)

# Field names of the first food record.
db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

# First entry of the first record's 'nutrients' list: a dict describing a
# single nutrient.
db[0]['nutrients'][0]

{'value': 25.18,
 'units': 'g',
 'description': 'Protein',
 'group': 'Composition'}

# Turn the first food's list of nutrient dicts into a table and preview the
# first seven rows.
nutrients = pd.DataFrame(db[0]['nutrients'])
nutrients.head(7)

description	group	units	value
0	Protein	                    Composition	 g	25.18
1	Total lipid (fat)	        Composition	 g	29.20
2	Carbohydrate, by difference	Composition	 g	3.06
3	Ash	                          Other	     g	3.28
4	Energy	                     Energy	   kcal	376.00
5	Water	                    Composition	 g	39.28
6	Energy	                        Energy	kJ	1573.00


# Build the food-information table from four of the record fields.
info_keys = ['description', 'group', 'id', 'manufacturer']
info = pd.DataFrame(data=db, columns=info_keys)
info.head(5)

       description	                          group	    id	     manufacturer
0	Cheese, caraway	                    Dairy and Egg Products	1008	
1	Cheese, cheddar	                    Dairy and Egg Products	1009	
2	Cheese, edam	                    Dairy and Egg Products	1018	
3	Cheese, feta	                    Dairy and Egg Products	1019	
4	Cheese, mozzarella, part skim milk	Dairy and Egg Products	1028

# Print dtypes and non-null counts per column (shows that 'manufacturer'
# has missing values).
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6636 entries, 0 to 6635
Data columns (total 4 columns):
description     6636 non-null object
group           6636 non-null object
id              6636 non-null int64
manufacturer    5195 non-null object
dtypes: int64(1), object(3)
memory usage: 207.5+ KB


# Count how many foods fall into each food group. The Series method is used
# because the top-level pd.value_counts function is deprecated.
info['group'].value_counts()

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Legumes and Legume Products          365
Fast Foods                           365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Beverages                            278
Soups, Sauces, and Gravies           275
Finfish and Shellfish Products       255
Baby Foods                           209
Cereal Grains and Pasta              183
Ethnic Foods                         165
Snacks                               162
Nut and Seed Products                128
Poultry Products                     116
Sausages and Luncheon Meats          111
Dairy and Egg Products               107
Fats and Oils                         97
Meals, Entrees, and Sidedishes        57
Restaurant Foods                      51
Spices and Herbs                      41
Name: group, dtype: int64

# Build one nutrient table per food, tagging every row with the food's id so
# the rows can later be joined back to the food information.
per_food = [
    pd.DataFrame(rec['nutrients']).assign(id=rec['id'])
    for rec in db
]
# Stack all per-food tables into a single frame with a fresh 0..n-1 index.
nutrients = pd.concat(per_food, ignore_index=True)
nutrients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389355 entries, 0 to 389354
Data columns (total 5 columns):
description    389355 non-null object
group          389355 non-null object
units          389355 non-null object
value          389355 non-null float64
id             389355 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 14.9+ MB

# De-duplication check: count the rows flagged as duplicates.
nutrients.duplicated().sum()

14179

# Drop the duplicated rows and keep the de-duplicated frame.
nutrients = nutrients.drop_duplicates()
# 'description' and 'group' here describe the nutrient; the food table (info)
# has its own 'description' and 'group' columns.
nutrients.head()

    description	                    group	 units	value	id
0	Protein	                     Composition	g	25.18	1008
1	Total lipid (fat)	         Composition	g	29.20	1008
2	Carbohydrate, by difference	 Composition	g	3.06	1008
3	Ash	                              Other	    g	3.28	1008
4	Energy	                        Energy	   kcal	376.00	1008

# Both tables have 'description' and 'group' columns, so rename them to keep
# the two meanings apart once the tables are combined.
col_mapping = {
    'description': 'food',
    'group': 'fgroup',
}
# Rename the food-table columns. The deprecated `copy` keyword is omitted;
# the result is rebound to `info`, so the outcome is the same.
info = info.rename(columns=col_mapping)
info.head()

            food	                         fgroup	              id	manufacturer
0	Cheese, caraway	                    Dairy and Egg Products	1008	
1	Cheese, cheddar	                    Dairy and Egg Products	1009	
2	Cheese, edam	                    Dairy and Egg Products	1018	
3	Cheese, feta	                    Dairy and Egg Products	1019	
4	Cheese, mozzarella, part skim milk	Dairy and Egg Products	1028

# Rename the nutrient-table columns so they do not clash with the food
# table's 'description' and 'group'. The deprecated `copy` keyword is
# omitted; the result is rebound to `nutrients`, so the outcome is the same.
col_mapping = {
    'description': 'nutrient',
    'group': 'nutgroup',
}
nutrients = nutrients.rename(columns=col_mapping)
nutrients.head()

          nutrient	               nutgroup	units	value	id
0	Protein	                    Composition	   g	25.18	1008
1	Total lipid (fat)	        Composition	   g	29.20	1008
2	Carbohydrate, by difference	Composition	   g	3.06	1008
3	Ash	                              Other	   g	3.28	1008
4	Energy	                         Energy	kcal	376.00	1008

# Combine the nutrient rows with the food information on the shared 'id'
# column, using an outer join.
ndata = nutrients.merge(info, how='outer', on='id')
ndata.head()

    nutrient	                  nutgroup	  units	    value	id	         food	           fgroup	          manufacturer
0	Protein	                     Composition	g	   25.18	1008	Cheese, caraway	   Dairy and Egg Products	
1	Total lipid (fat)	         Composition	g	   29.20	1008	Cheese, caraway	   Dairy and Egg Products	
2	Carbohydrate, by difference	 Composition	g	    3.06	1008	Cheese, caraway	   Dairy and Egg Products	
3	Ash	                          Other	        g	    3.28	1008	Cheese, caraway    Dairy and Egg Products	
4	Energy	                      Energy	    kcal	376.00	1008	Cheese, caraway	   Dairy and Egg Products	

# Group the merged rows by nutrient and food group, then pick, for every
# (nutrient, food group) pair, the row with the highest value.
by_nutrient = ndata.groupby(['nutrient', 'fgroup'])


def get_maximum(x):
    """Return the row of group *x* whose 'value' column is largest."""
    return x.xs(x['value'].idxmax())


max_foods = by_nutrient.apply(get_maximum)
max_foods.head()

# Show only the 'value' and 'food' columns of the result.
max_foods[['value','food']].head()

		                                                     value	         food
nutrient	             fgroup		
Adjusted Protein	      Sweets	                        12.900	Baking chocolate, unsweetened, squares
                         Vegetables and Vegetable Products	2.180	Mushrooms, white, raw
Alanine	                 Baby Foods	                        0.911	Babyfood, meat, ham, junior
                         Baked Products	                    2.320	Leavening agents, yeast, baker's, active dry
                         Beef Products	                    2.254	Beef, cured, breakfast strips, cooked

posted on 2018-12-05 17:30 进击中的青年阅读(302) 评论(0) 编辑收藏举报

刷新页面返回顶部

导航

字符串操作

正则表达式

如果想避免正则表达式中不需要的转义（\），则可以使用原始字符串字面量，如 r'C:\x'（等价于 'C:\\x'）。

pandas中矢量化的字符串函数

通过 data2.map，所有字符串和正则表达式方法都能（以 lambda 表达式或其他函数的形式）应用于各个值，但如果存在 NA（缺失值）就会报错。

为了解决这个问题，可以使用 Series 对象的 str 属性：它提供的字符串方法是矢量化的，并且会自动跳过 NA 值。

示例： USDA食品数据库