Python 数据分析 - 网页信息抓取

HTML简述:

 

import urllib.request;
from bs4 import BeautifulSoup;
# Read the local sample HTML file. The with-block closes the handle as soon
# as the body has been read, instead of leaving it open for the session.
with urllib.request.urlopen("file:///C:/PA/6.1/html.html") as response:
    html = response.read()
html

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# is installed, which can change the parse tree between machines and emits a
# warning.
soup = BeautifulSoup(html, "html.parser")
soup

# find() returns the first matching tag (or None); find_all() returns every
# match. The bare expressions are kept for interactive inspection.
soup.find("tr")
soup.find_all("tr")
View Code

JSON简述:

 

import json;
import urllib.request;
from bs4 import BeautifulSoup;
# Read the local sample JSON file. The with-block closes the handle once the
# raw bytes have been read.
with urllib.request.urlopen("file:///C:/PA/6.2/json.json") as response:
    jsonstring = response.read()

# read() yields bytes; decode them to text before parsing.
jsonobject = json.loads(jsonstring.decode())

# Drill down step by step: the "employees" value, its first entry, and that
# entry's "lastName" field. The bare expressions are kept for interactive
# inspection.
jsonobject["employees"]
jsonobject["employees"][0]
jsonobject["employees"][0]["lastName"]
View Code

 

解析网页:

 综合案例分析:

# -*- coding: utf-8 -*-
import json;
import urllib.request;

from pandas import Series;
from pandas import DataFrame;

from bs4 import BeautifulSoup;

# Download the product page; the with-block closes the connection once the
# body has been read.
with urllib.request.urlopen('http://item.jd.com/1185291.html') as response:
    html = response.read()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')

# The rows of interest live under the element with this id.
divSoup = soup.find(id="product-detail-2")

# find() returns None when the element is missing (e.g. the page layout
# changed); treat that as "no rows" rather than failing with AttributeError.
trs = divSoup.find_all('tr') if divSoup is not None else []

# Collect the rows in a plain list and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame
# on every call.
rows = []
for tr in trs:
    tds = tr.find_all('td')
    # Only rows with exactly two cells are kept as (feature, property) pairs.
    if len(tds) == 2:
        f = tds[0].getText()
        p = tds[1].getText()
        rows.append([f, p])

data = DataFrame(rows, columns=['Feature', 'Property'])

len(data)

# Fetch the price document for the same product from the price endpoint.
with urllib.request.urlopen('http://p.3.cn/prices/get?skuid=J_1185291') as response:
    jsonString = response.read()

# read() yields bytes; decode them to text before parsing.
jsonObject = json.loads(jsonString.decode())

# The 'p' field of the first entry (used as the price elsewhere in this file).
jsonObject[0]['p']
View Code

复杂些的:

# -*- coding: utf-8 -*-
import json;
import urllib.request;

from pandas import Series;
from pandas import DataFrame;
from pandas import read_csv;
from bs4 import BeautifulSoup;

# Product ids to scrape; the first column of each CSV row is used as the id.
pids = read_csv("D:\\PA\\6.5\\pids.csv")

# Shared column names, defined once so they can be reused below.
pColumns = ['PID', 'Price']
fColumns = ['PID', 'Feature', 'Property']

# Rows are accumulated in plain lists and turned into frames after the loop.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame
# on every call.
pRows = []
fRows = []

# Walk through every product that should be scraped.
for pid in pids.values:
    # str() works whether the CSV column was read as numbers or as strings;
    # .astype(str) only exists on NumPy scalars.
    PID = str(pid[0])

    pUrl = 'http://p.3.cn/prices/get?skuid=J_' + PID
    print("开始处理价格数据:" + pUrl)

    # The with-block closes the connection once the body has been read.
    with urllib.request.urlopen(pUrl) as response:
        jsonString = response.read()

    jsonObject = json.loads(jsonString.decode())

    # The price is the 'p' field of the first entry in the response.
    Price = float(jsonObject[0]['p'])
    pRows.append([PID, Price])

    fUrl = "http://item.jd.com/" + PID + ".html"
    print("开始处理属性数据:" + fUrl)
    with urllib.request.urlopen(fUrl) as response:
        html = response.read()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    divSoup = soup.find(id="product-detail-2")
    # find() returns None when the element is missing; record no features for
    # this product rather than aborting the whole run with AttributeError.
    trs = divSoup.find_all('tr') if divSoup is not None else []

    for tr in trs:
        tds = tr.find_all('td')
        # Only rows with exactly two cells are kept as (feature, property).
        if len(tds) == 2:
            Feature = tds[0].getText()
            Property = tds[1].getText()
            fRows.append([PID, Feature, Property])

# Frames holding the collected prices and features.
pData = DataFrame(pRows, columns=pColumns)
fData = DataFrame(fRows, columns=fColumns)

print(pData)
print(fData)
View Code

 

posted @ 2018-06-06 16:34  秋雨秋雨秋雨  阅读(1561)  评论(0)  编辑  收藏  举报