网站更新内容:请访问: https://bigdata.ministep.cn/

解析 HTML 中的 table 内容

#html
#内容解析
from lxml import etree
import pandas as pd
import re
def get_data(html):
    """Parse the torrent listing table of an HTML page into a DataFrame.

    The first ``<table>`` whose ``class`` attribute contains ``"torrents"`` is
    located, and every direct ``<tr>`` child is turned into one record: the
    concatenated text of each ``<td>`` cell, followed by the numeric id taken
    from the first link in the row whose ``href`` contains ``php?id=<digits>``
    (an empty string when the row has no such link).

    Args:
        html: HTML source of the page, as accepted by ``lxml.etree.HTML``.

    Returns:
        A ``pandas.DataFrame`` with the ten columns listed in ``labels``.
        The first parsed row (the table's header row) is dropped, so the
        index of the returned frame starts at 1. When the document cannot be
        parsed or contains no matching table, an empty DataFrame with the
        same columns is returned.

    Raises:
        ValueError: propagated from pandas when a row does not produce
            exactly ten values (nine cells plus the id) — presumably only
            if the page layout changes; verify against the target site.
    """
    labels = ['类型','标题','评论数','存活时间','大小','种子数','下载数','完成数','发布者','标题id']

    tree = etree.HTML(html)
    tables = tree.xpath('//table[contains(@class,"torrents")]') if tree is not None else []
    if not tables:
        # Nothing to parse: hand back an empty frame instead of an IndexError.
        return pd.DataFrame(columns=labels)

    # Raw string so that ``\?`` and ``\d`` are regex escapes, not (invalid)
    # Python string escapes; compiled once because it is used for every row.
    id_pattern = re.compile(r'php\?id=(\d+)')

    records = []
    for row in tables[0].xpath('./tr'):
        # One string per cell: all descendant text nodes glued together.
        record = [''.join(cell.xpath('.//text()')) for cell in row.xpath('./td')]

        hrefs = row.xpath(".//a[contains(@href,'php?id')]/@href")
        # The first id found among the row's links wins; rows without a
        # matching link (e.g. the header row) get an empty id.
        match = id_pattern.search(' '.join(hrefs))
        record.append(match.group(1) if match else '')

        records.append(record)

    df = pd.DataFrame.from_records(records, columns=labels)
    # The table's first row is its header, not data — drop it.
    return df.iloc[1:]
#get_data(html)

存入 MongoDB 数据库

import pandas as pd
from pymongo import MongoClient
from sqlalchemy import create_engine
def data_to_dataframe(data, host='67.216.204.220', port=27017):
    """Insert every row of a DataFrame into MongoDB.

    Despite its name, this function writes *to* the database: each row of
    ``data`` becomes one document in the ``pt_btschool_net_torrents``
    collection of the ``pt`` database.

    Args:
        data: ``pandas.DataFrame`` whose rows are to be stored.
        host: MongoDB server address; defaults to the server the original
            snippet was written for.
        port: MongoDB server port.

    Returns:
        The fixed status string ``'success dataframe_to_mongodb '``.
        An empty DataFrame results in no insert and the same status string.
    """
    from pymongo import MongoClient

    records = data.to_dict('records')
    # insert_many() rejects an empty list, so only connect when there is
    # something to write.
    if records:
        # The context manager closes the client (and its connections) even
        # when the insert fails.
        with MongoClient(host, port) as client:
            client.pt.pt_btschool_net_torrents.insert_many(records)
    return 'success dataframe_to_mongodb '
#data_to_dataframe(df)

html 内容是table的解析办法:

说明:编写 XPath 时不要包含 tbody。浏览器开发者工具里看到的 tbody 往往是浏览器自动补全的,原始 HTML 中通常并不存在,写进 XPath 反而会匹配不到内容。

import pandas as pd
from lxml import html

from io import BytesIO

# Example: fetch a wiki page, pick one table by XPath and load it into a
# DataFrame with pandas.
url = "http://www.uesp.net/wiki/Skyrim:No_Stone_Unturned"
xpath = "//*[@id=\"mw-content-text\"]/table[3]"

# html.parse() downloads and parses the page; the XPath result is a list,
# of which only the first matching table is used.
tree = html.parse(url)
table = tree.xpath(xpath)[0]
# Serialize just that table back to markup (bytes) for pandas.
raw_html = html.tostring(table)

# Passing literal HTML to read_html() is deprecated since pandas 2.1; a
# file-like wrapper works on old and new versions alike.
dta = pd.read_html(BytesIO(raw_html), header=0)[0]
dta["completed"] = 0
del dta["Map"]
参考地址:https://gist.github.com/jseabold/5892603

案例:

from lxml import etree
import pandas as pd
from io import BytesIO

# NOTE(review): `html` is assumed to be an already-parsed lxml tree (for
# example the result of etree.HTML(page_source)), not the `lxml.html`
# module — confirm against the surrounding code.
table = html.xpath('//table[@id="torrenttable"]')[0]
# Serialize only the selected table; etree.tostring() returns bytes.
raw_html = etree.tostring(table)
# Passing literal HTML to read_html() is deprecated since pandas 2.1; a
# file-like wrapper works on old and new versions alike.
data = pd.read_html(BytesIO(raw_html), header=0)[0]

读取 MongoDB 数据内容

import pandas as pd
from sqlalchemy import create_engine
def read_mongb(host='67.216.204.220', port=27017):
    """Load the whole ``pt_btschool_net_torrents`` collection as a DataFrame.

    Args:
        host: MongoDB server address; defaults to the server the original
            snippet was written for.
        port: MongoDB server port.

    Returns:
        A ``pandas.DataFrame`` with one row per document found in the
        ``pt_btschool_net_torrents`` collection of the ``pt`` database.
    """
    from pymongo import MongoClient

    # The cursor must be fully consumed while the client is still open, so
    # the documents are materialized inside the ``with`` block; the client
    # is closed afterwards even if the query fails.
    with MongoClient(host, port) as client:
        documents = list(client.pt.pt_btschool_net_torrents.find())
    return pd.DataFrame(documents)
# Load the stored documents into a DataFrame.
data = read_mongb()
# Notebook-style inspection: preview the first rows ...
data.head()
# ... and list the column names.
data.columns
posted @ 2021-04-04 19:40  ministep88  阅读(243)  评论(0)  编辑  收藏  举报
网站更新内容:请访问:https://bigdata.ministep.cn/