groupby()把迭代器中相邻的重复元素挑出来放在一起:
import itertools

# groupby() yields (key, group) pairs for each run of adjacent equal items.
# Each group is a lazy iterator, so it is materialized with list() to show it.
for key, group in itertools.groupby('AAABBBCCAAA'):
    print(key, list(group))
# Output:
# A ['A', 'A', 'A']
# B ['B', 'B', 'B']
# C ['C', 'C']
# A ['A', 'A', 'A']
分组规则由 key 函数决定:只要 key 函数作用于相邻的两个元素所返回的值相等,这两个元素就被归入同一组,而该返回值就作为这一组的 key。
例子1:将相同国家的人员信息进行归纳
from itertools import groupby

d1 = {'name': 'Lilei', 'age': 15, 'country': 'China'}
d2 = {'name': 'jack', 'age': 19, 'country': 'USA'}
d3 = {'name': '苍老师', 'age': 22, 'country': 'JP'}
d4 = {'name': 'tom', 'age': 22, 'country': 'USA'}
d5 = {'name': 'lucy', 'age': 22, 'country': 'USA'}
d6 = {'name': 'Hanmeimei', 'age': 15, 'country': 'China'}
lst = [d1, d2, d3, d4, d5, d6]

# groupby() only merges adjacent items, so the list must be sorted by the
# grouping key first; otherwise the same country would appear in several groups.
lst.sort(key=lambda x: x['country'])
print(lst)
# [{'name': 'Lilei', 'age': 15, 'country': 'China'},
#  {'name': 'Hanmeimei', 'age': 15, 'country': 'China'},
#  {'name': '苍老师', 'age': 22, 'country': 'JP'},
#  {'name': 'jack', 'age': 19, 'country': 'USA'},
#  {'name': 'tom', 'age': 22, 'country': 'USA'},
#  {'name': 'lucy', 'age': 22, 'country': 'USA'}]

# A groupby object is a one-shot iterator: list() exhausts it. Inspect a
# throwaway instance here so that the loop below still has groups to yield.
print(list(groupby(lst, key=lambda x: x['country'])))
# [('China', <itertools._grouper object at 0x...>),
#  ('JP', <itertools._grouper object at 0x...>),
#  ('USA', <itertools._grouper object at 0x...>)]

lst_g = groupby(lst, key=lambda x: x['country'])
for c, g in lst_g:
    # Each group is itself a lazy iterator; list() collects its members.
    print({c: list(g)})
# {'China': [{'name': 'Lilei', 'age': 15, 'country': 'China'}, {'name': 'Hanmeimei', 'age': 15, 'country': 'China'}]}
# {'JP': [{'name': '苍老师', 'age': 22, 'country': 'JP'}]}
# {'USA': [{'name': 'jack', 'age': 19, 'country': 'USA'}, {'name': 'tom', 'age': 22, 'country': 'USA'}, {'name': 'lucy', 'age': 22, 'country': 'USA'}]}
例子2:归纳列表中连续的数字
from itertools import groupby

lst = [2, 3, 5, 6, 7, 8, 1, 11, 12, 13, 15, 27, 28, 29]
lst.sort()
print(lst)
# [1, 2, 3, 5, 6, 7, 8, 11, 12, 13, 15, 27, 28, 29]
print(list(enumerate(lst)))
# [(0, 1), (1, 2), (2, 3), (3, 5), (4, 6), (5, 7), (6, 8), (7, 11),
#  (8, 12), (9, 13), (10, 15), (11, 27), (12, 28), (13, 29)]

# Within a run of consecutive integers, (value - index) is constant, so that
# difference serves as the grouping key. Printing list(g) for each group would
# show the raw (index, value) pairs of every run:
# [(0, 1), (1, 2), (2, 3)]
# [(3, 5), (4, 6), (5, 7), (6, 8)]
# [(7, 11), (8, 12), (9, 13)]
# [(10, 15)]
# [(11, 27), (12, 28), (13, 29)]
for _, g in groupby(enumerate(lst), key=lambda x: x[1] - x[0]):
    # Drop the index and keep only the values of the run.
    print([v for i, v in g])
# [1, 2, 3]
# [5, 6, 7, 8]
# [11, 12, 13]
# [15]
# [27, 28, 29]
例子3:归纳列表中连续的ip
import ipaddress
from itertools import groupby

ip_list = [
    '10.16.49.113',
    '10.202.255.127',
    '10.202.255.125',
    '10.202.255.126',
    '10.202.255.145',
    '10.202.255.175',
    '10.202.255.174',
    '10.202.255.144',
    '10.202.255.173',
]

# Convert every address to its integer value. Duplicates are dropped because a
# repeated value would shift (value - index) and split a consecutive run.
ip_list_int = sorted({int(ipaddress.ip_address(ip)) for ip in ip_list})
# print(ip_list_int)
# [168833393, 181075837, 181075838, 181075839, 181075856, 181075857,
#  181075885, 181075886, 181075887]

rst = []
# Same trick as with plain integers: consecutive addresses share the same
# (value - index) difference. The (index, value) pairs per run are:
# [(0, 168833393)]
# [(1, 181075837), (2, 181075838), (3, 181075839)]
# [(4, 181075856), (5, 181075857)]
# [(6, 181075885), (7, 181075886), (8, 181075887)]
for _, run in groupby(enumerate(ip_list_int), key=lambda x: x[1] - x[0]):
    ip_range_list = [v for _, v in run]
    # ip_range_list per iteration:
    # [168833393]
    # [181075837, 181075838, 181075839]
    # [181075856, 181075857]
    # [181075885, 181075886, 181075887]
    if len(ip_range_list) > 1:
        # Summarize the run first..last as the minimal list of CIDR networks.
        ip_range = ipaddress.summarize_address_range(
            ipaddress.ip_address(ip_range_list[0]),
            ipaddress.ip_address(ip_range_list[-1]),
        )
        rst.extend(str(ip_summ) for ip_summ in ip_range)
    else:
        # An isolated address is reported as a bare IP, without a prefix.
        rst.append(str(ipaddress.ip_address(ip_range_list[0])))
print(rst)
# ['10.16.49.113', '10.202.255.125/32', '10.202.255.126/31',
#  '10.202.255.144/31', '10.202.255.173/32', '10.202.255.174/31']