Getting Started with Python Web Scraping
Fetching the full contents of a web page
import requests

url = 'http://www.wise.xmu.edu.cn/people/faculty'
r = requests.get(url)    # send an HTTP GET request for the page
html = r.content         # raw bytes of the response body
print html
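requests exposes a few response attributes that are worth checking before you start parsing. The snippet below is a minimal sketch (same URL as above, assumed reachable) that verifies the status code and contrasts the raw bytes in r.content with the decoded text in r.text.

import requests

url = 'http://www.wise.xmu.edu.cn/people/faculty'
r = requests.get(url)
print r.status_code      # 200 means the request succeeded
print r.encoding         # the encoding requests guessed from the response headers
text = r.text            # response body decoded to unicode using that encoding
html = r.content         # response body as raw bytes, which BeautifulSoup accepts directly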
Adding bs4
bs4 (Beautiful Soup 4) is an excellent library for parsing web pages. Let's start with the most commonly used methods on a BeautifulSoup object: the basic idea is to locate elements by their HTML tags and the attributes on those tags, and then extract data from the matched elements with methods such as find, find_all, and get_text.
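As a quick illustration of find and find_all before the real example, here is a toy snippet run against a hard-coded HTML fragment (the fragment and its class name are made up for illustration):

from bs4 import BeautifulSoup

demo_html = '<div class="box"><a href="/a">First</a><a href="/b">Second</a></div>'
soup = BeautifulSoup(demo_html, 'html.parser')
box = soup.find('div', attrs={'class': 'box'})      # first <div> whose class is "box"
for link in box.find_all('a'):                       # every <a> inside that div
    print link['href'], link.get_text()              # attribute access and text extraction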
Here we extract the profile link and name of each faculty member.
import requests
from bs4 import BeautifulSoup

url = 'http://www.wise.xmu.edu.cn/people/faculty'
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html, 'html.parser')
# locate the <div class="people_list"> container, then every <a target="_blank"> inside it
div_people_list = soup.find('div', attrs={'class': 'people_list'})
href_list = div_people_list.find_all('a', attrs={'target': '_blank'})
for people in href_list:
    people_url = people['href']                # link to the faculty member's page
    people_name = people.get_text().strip()    # visible link text, i.e. the name
    print people_url, '\t', people_name
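One fragile point in this script is that soup.find returns None when the target tag is missing (for example if the page layout changes), so the following find_all would raise an AttributeError. A small defensive variant, reusing the soup object from above, purely as a sketch:

div_people_list = soup.find('div', attrs={'class': 'people_list'})
if div_people_list is None:
    # the page layout changed or the request returned something unexpected
    print 'people_list container not found'
else:
    for people in div_people_list.find_all('a', attrs={'target': '_blank'}):
        print people['href'], '\t', people.get_text().strip()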
Saving the data
# -*- coding: utf-8 -*-
import csv
import requests
from bs4 import BeautifulSoup

def getHTML(url):
    # download the page and return the raw bytes
    r = requests.get(url)
    return r.content

def parseHTML(html):
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body
    company_middle = body.find('div', attrs={'class': 'middle'})
    company_list_ct = company_middle.find('div', attrs={'class': 'list-ct'})
    company_list = []
    for company_ul in company_list_ct.find_all('ul', attrs={'class': 'company-list'}):
        for company_li in company_ul.find_all('li'):
            company_name = company_li.get_text()
            company_url = company_li.a['href']
            # encode to UTF-8 bytes so the Python 2 csv module can write them
            company_list.append([company_name.encode('utf-8'), company_url.encode('utf-8')])
    return company_list

def writeCSV(file_name, data_list):
    # binary mode is correct for the csv module under Python 2
    with open(file_name, 'wb') as f:
        writer = csv.writer(f)
        for data in data_list:
            writer.writerow(data)

if __name__ == '__main__':
    url = 'http://www.cninfo.com.cn/cninfo-new/information/companylist'
    html = getHTML(url)
    data_list = parseHTML(html)
    writeCSV('test.csv', data_list)
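This script targets Python 2 (print statements, byte-string encoding, opening the CSV file in 'wb' mode). Under Python 3 the writeCSV function would look roughly like the sketch below; in that case the .encode('utf-8') calls in parseHTML should also be dropped, since the Python 3 csv module expects str values. This is an adaptation sketch, not part of the original tutorial.

import csv

def writeCSV(file_name, data_list):
    # Python 3: open in text mode, let the csv module handle newlines,
    # and write unicode strings directly instead of UTF-8 encoded bytes
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for data in data_list:
            writer.writerow(data)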