Python 规范化LinkedIn用户联系人的职位名
CODE:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on 2014-8-19

@author: guaguastd
@name: job_title_standard.py

Normalize the job titles found in a LinkedIn connections CSV export and
print two frequency tables: one of whole titles and one of the individual
words (tokens) that make up those titles.
'''

import csv
import os
import re
from collections import Counter
from operator import itemgetter

# specify csv directory
CSV_FILE = os.path.join(r"E:", "\\", "eclipse", "LinkedIn", "dfile",
                        "my_connections.csv")

# define a set of transforms that converts the first item
# to the second item
transforms = [
    ('Sr.', 'Senior'),
    ('Sr', 'Senior'),
    ('Jr.', 'Junior'),
    ('Jr', 'Junior'),
    ('CEO', 'Chief Executive Officer'),
    ('COO', 'Chief Operating Officer'),
    ('CTO', 'Chief Technology Officer'),
    ('CFO', 'Chief Finance Officer'),
    ('VP', 'Vice President'),
]


def load_contacts(csv_path):
    '''Return every row of the CSV at *csv_path* as a list of dicts.

    The file is opened in a ``with`` block so the handle is closed as
    soon as the rows have been read.
    '''
    with open(csv_path) as csv_file:
        return list(csv.DictReader(csv_file, delimiter=',', quotechar='"'))


def split_titles(contacts):
    '''Return the individual job titles held by *contacts*.

    Each contact is a mapping with a ``'Job Title'`` entry.  Combined
    titles written with a slash, such as "President/CEO", are split into
    separate titles.  Other ways of combining titles ("President & CEO",
    "President and CEO") are NOT split and stay as a single title.

    Blank pieces are dropped, so an empty field or a trailing slash does
    not produce an empty title.  A field that is ``None`` (which
    ``csv.DictReader`` yields for a short row) is treated as empty.
    '''
    titles = []
    for contact in contacts:
        raw_title = contact['Job Title'] or ''
        for piece in raw_title.split('/'):
            piece = piece.strip()
            if piece:
                titles.append(piece)
    return titles


def _abbreviation_pattern(abbreviation):
    '''Compile a regex matching *abbreviation* only as a whole word.

    A word-boundary guard is added on a side only when that side of the
    abbreviation is itself a word character, so 'Sr.' (ending in a dot)
    still matches directly in front of any following text.
    '''
    prefix = r'(?<!\w)' if re.match(r'\w', abbreviation) else ''
    suffix = r'(?!\w)' if re.search(r'\w$', abbreviation) else ''
    return re.compile(prefix + re.escape(abbreviation) + suffix)


def expand_abbreviations(title, rules=None):
    '''Return *title* with common/known abbreviations spelled out.

    *rules* is a sequence of ``(abbreviation, expansion)`` pairs applied
    in order; it defaults to the module-level ``transforms``.  Only
    whole-word occurrences are replaced, so words that merely contain an
    abbreviation (e.g. "DIRECTOR" contains "CTO") are left untouched.
    '''
    if rules is None:
        rules = transforms
    for abbreviation, expansion in rules:
        pattern = _abbreviation_pattern(abbreviation)
        # A callable replacement keeps the expansion literal: backslashes
        # in it are never interpreted as regex group references.
        title = pattern.sub(lambda _match, text=expansion: text, title)
    return title


def tokenize(titles):
    '''Return the whitespace-separated words of *titles*.

    Leading and trailing commas are stripped from every word.
    '''
    tokens = []
    for title in titles:
        tokens.extend(word.strip(',') for word in title.split())
    return tokens


def frequency_rows(items, min_length=0):
    '''Return ``(item, count)`` pairs sorted by descending count.

    Items shorter than *min_length* characters are left out.
    '''
    ranked = sorted(Counter(items).items(), key=itemgetter(1), reverse=True)
    return [(item, freq) for item, freq in ranked if len(item) >= min_length]


def build_table(header, rows):
    '''Return a left-aligned PrettyTable with columns *header* and 'Freq'.'''
    # Imported here so the helpers above remain importable on systems
    # where the third-party prettytable package is not installed.
    from prettytable import PrettyTable

    table = PrettyTable(field_names=[header, 'Freq'])
    table.align = 'l'
    for item, freq in rows:
        table.add_row([item, freq])
    return table


def main(csv_path=CSV_FILE):
    '''Read the contacts CSV and print the title and token tables.'''
    contacts = load_contacts(csv_path)

    # Replace common/known abbreviations
    titles = [expand_abbreviations(title) for title in split_titles(contacts)]

    # Print out a table of titles sorted by frequency
    print(build_table('Title', frequency_rows(titles)))

    # Print out a table of tokens sorted by frequency; tokens of one or
    # two characters are left out of the table.
    tokens = tokenize(titles)
    print(build_table('Token', frequency_rows(tokens, min_length=3)))


if __name__ == '__main__':
    main()
RESULT:
+-----------------------------------+------+
| Title                             | Freq |
+-----------------------------------+------+
| Senior Software Developer         | 1    |
| Sales Manager                     | 1    |
| Software Manager                  | 1    |
| Online Marketing Manager          | 1    |
| Senior Consultant                 | 1    |
| Chief Executive Officer & Founder | 1    |
| Director                          | 1    |
| S                                 | 1    |
| Student                           | 1    |
| Senior Software Engineer          | 1    |
| ???                               | 1    |
+-----------------------------------+------+
+------------+------+
| Token      | Freq |
+------------+------+
| Manager    | 3    |
| Senior     | 3    |
| Software   | 3    |
| Marketing  | 1    |
| Founder    | 1    |
| Consultant | 1    |
| Executive  | 1    |
| Sales      | 1    |
| Developer  | 1    |
| Director   | 1    |
| Chief      | 1    |
| Officer    | 1    |
| Student    | 1    |
| Online     | 1    |
| ???        | 1    |
| Engineer   | 1    |
+------------+------+