Using bs4 to extract job-posting information from Haitou (海投网) pages and store it in MongoDB

Example: http://xyzp.haitou.cc/article/722427.html

First, download every page to disk. You can use os.system("wget " + str(url)) or urllib2.urlopen(url); this step is straightforward, so I won't dwell on it.
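A minimal sketch of that download step (the helper name, ID range, and timeout below are my own illustration, assuming consecutive article IDs):

# Minimal sketch of the download step, assuming consecutive article IDs.
import urllib2

def download_pages(start_id, end_id, outdir="./"):
    for aid in range(start_id, end_id):
        url = "http://xyzp.haitou.cc/article/%d.html" % aid
        try:
            html = urllib2.urlopen(url, timeout=10).read()
        except Exception as e:
            print url, "failed:", e
            continue
        with open("%s%d.html" % (outdir, aid), "w") as fw:
            fw.write(html)

download_pages(722427, 722437)   # hypothetical ID range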

Then comes the main part, the information extraction:

#!/usr/bin/env python
# coding=utf-8

from bs4 import BeautifulSoup
import codecs
import os
import re
import sys

reload(sys)
sys.setdefaultencoding("utf-8")   # Python 2 hack so mixed str/unicode writes don't crash

from pymongo import MongoClient

def get_jdstr(fname):
    """Parse one saved page and pull out the fields we care about."""
    retdict = {}
    with open(fname) as fr:
        # clean up doubled quotes before parsing
        soup = BeautifulSoup(fr.read().replace('""', '"'), "html.parser")

    jdstr = soup.get_text()

    retdict["inc_name"] = soup.title.string.split()[0]   # company name: first token of <title>
    retdict["page_content"] = soup.find_all("div", "panel-body panel-body-text")[0].get_text()
    retdict["index_url"] = re.search(r"http://xyzp\.haitou\.cc/article/\d+\.html", jdstr).group()
    retdict["info_from"] = soup.find_all("p", "text-ellipsis")[0].contents[1].get_text()   # source
    retdict["workplace"] = soup.find_all("p", "text-ellipsis")[1].contents[1].get_text()   # work location
    retdict["info_tag"] = soup.find_all("p", "text-ellipsis")[2].contents[1].get_text()    # tags
    retdict["pub_time"] = soup.find_all("p", "text-ellipsis")[3].contents[1].get_text()    # publish time

    return retdict



def JD_extr():
    """Extract every saved page and also dump the fields to a CSV for inspection."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".html")]
    fw = codecs.open("tmp_jd_haitou_clean.csv", "w", "utf-8")
    res = []
    for fname in fnames[:500]:   # process at most 500 saved pages
        retdict = get_jdstr(fname)
        res.append(retdict)
        fw.write(" , ".join(retdict.itervalues()) + "\n")
        fw.write("===" * 20 + "\n")   # record separator
    fw.close()
    print len(res), "files done!"
    return res



def change2html():
    """Rename downloaded .txt files to .html so the extractor picks them up."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".txt")]
    for fname in fnames:
        cmd = "mv " + str(fname) + " " + fname[:-3] + "html"
        print cmd
        os.system(cmd)


def store2mongodb():
    """Insert every extracted document into the JD_Haitou database."""
    client = MongoClient("localhost", 27017)
    db = client.JD_Haitou

    documents = JD_extr()
    for d in documents:
        db.haitouJD.insert(d)

    print db["haitouJD"].count()   # sanity check: number of stored documents



def split_jd_test_data(fname='./tmp_jd_haitou_clean.csv'):
    """Pull the article URLs out of the CSV and write them with a default label."""
    fw = codecs.open('./split_jd_res.csv', 'w', 'utf-8')
    fr = codecs.open(fname, 'r', 'utf-8')
    indexurl = re.compile(r"http://xyzp\.haitou\.cc/article/\d+\.html")
    for line in fr:
        if indexurl.search(line):
            url = indexurl.search(line).group()
            cnt = '1'   # label defaults to 1
            fw.write(url + "\t" + cnt + "\n")
    fr.close()
    fw.close()




if __name__ == "__main__":
    store2mongodb()      # runs JD_extr() internally, which also writes the CSV
    split_jd_test_data()
    print "done"

 
