python数据分析-网页信息抓取
HTML简述:
import urllib.request; from bs4 import BeautifulSoup; response=urllib.request.urlopen("file:///C:/PA/6.1/html.html"); html=response.read(); html soup=BeautifulSoup(html); soup soup.find("tr"); soup.find_all("tr")
JSON简述:
import json; import urllib.request; from bs4 import BeautifulSoup; response=urllib.request.urlopen("file:///C:/PA/6.2/json.json"); jsonstring=response.read(); jsonobject=json.loads(jsonstring.decode()) jsonobject["employees"] jsonobject["employees"][0] jsonobject["employees"][0]["lastName"]
解析网页:
综合案例分析:
# -*- coding: utf-8 -*- import json; import urllib.request; from pandas import Series; from pandas import DataFrame; from bs4 import BeautifulSoup; response = urllib.request.urlopen('http://item.jd.com/1185291.html'); html = response.read(); soup = BeautifulSoup(html); divSoup = soup.find(id="product-detail-2") data = DataFrame(columns=['Feature', 'Property']) trs = divSoup.find_all('tr'); for tr in trs : tds = tr.find_all('td'); if len(tds)==2: f=tds[0].getText(); p=tds[1].getText(); data = data.append( Series( [f, p], index=['Feature', 'Property'] ), ignore_index=True ); len(data) response = urllib.request.urlopen('http://p.3.cn/prices/get?skuid=J_1185291'); jsonString = response.read(); jsonObject = json.loads(jsonString.decode()) jsonObject[0]['p']
复杂些的:
# -*- coding: utf-8 -*- import json; import urllib.request; from pandas import Series; from pandas import DataFrame; from pandas import read_csv; from bs4 import BeautifulSoup; pids = read_csv("D:\\PA\\6.5\\pids.csv"); #定义统一的列名,下面可以避免重复输入,直接引用即可 pColumns = ['PID', 'Price']; fColumns = ['PID', 'Feature', 'Property']; #定义存储价格和属性的数据框 pData = DataFrame(columns=pColumns); fData = DataFrame(columns=fColumns); #开始遍历我们需要抓取的商品数据 for pid in pids.values: PID = pid[0].astype(str); pUrl = 'http://p.3.cn/prices/get?skuid=J_' + PID; print("开始处理价格数据:" + pUrl); response = urllib.request.urlopen(pUrl); jsonString = response.read(); jsonObject = json.loads(jsonString.decode()); Price = float(jsonObject[0]['p']); pData = pData.append( Series( [PID, Price], index=pColumns ), ignore_index=True ); fUrl = "http://item.jd.com/" + PID + ".html"; print("开始处理属性数据:" + fUrl); response = urllib.request.urlopen(fUrl); html = response.read(); soup = BeautifulSoup(html); divSoup = soup.find(id="product-detail-2"); trs = divSoup.find_all('tr'); for tr in trs : tds = tr.find_all('td'); if len(tds)==2: Feature = tds[0].getText(); Property=tds[1].getText(); fData = fData.append( Series( [PID, Feature, Property], index=fColumns ), ignore_index=True ); print(pData); print(fData);