# Web crawler -- an approach that works together with multiple threads
from pyquery import PyQuery as pq
import os
from queue import Queue
from threading import Thread
class txtparser(Thread):
    """Worker thread that turns saved chapter HTML files into plain-text files.

    Each worker repeatedly takes an HTML file path from the shared queue,
    extracts the chapter title and body with pyquery, and writes the body to
    ``<folder>/<title>.txt`` relative to the current working directory, where
    ``<folder>`` is the name of the HTML file's parent directory with the
    ``《`` and ``》`` characters removed.

    Every queue item is acknowledged with ``task_done()`` whether it succeeds
    or fails, so that ``Queue.join()`` in the launcher can return. Putting
    ``None`` on the queue tells one worker to stop.
    """

    def __init__(self, queue):
        """Remember the shared work queue.

        Args:
            queue: A ``queue.Queue`` holding HTML file paths (``str``); a
                ``None`` item is treated as a stop request.
        """
        super().__init__()
        self.queue = queue

    def run(self):
        """Process queue items until a ``None`` stop request is received."""
        while True:
            content = self.queue.get()
            try:
                if content is None:
                    return
                self._convert(content)
            except Exception as err:
                # Worker boundary: one bad file must not kill the thread,
                # otherwise the remaining queue items are never consumed.
                print("failed to process {}: {!r}".format(content, err))
            finally:
                # Always acknowledge the item; without this Queue.join()
                # blocks forever.
                self.queue.task_done()

    @staticmethod
    def _read_html(path):
        """Return the text of *path*, or ``None`` if it cannot be read.

        UTF-8 is tried first; if decoding fails, the platform default
        encoding is used instead.
        """
        try:
            with open(path, "r", encoding="utf-8") as reader:
                return reader.read()
        except UnicodeDecodeError:
            pass  # not UTF-8: retry below with the default encoding
        except OSError as err:
            print("cannot read {}: {!r}".format(path, err))
            return None
        try:
            with open(path, "r") as reader:
                return reader.read()
        except (OSError, UnicodeDecodeError) as err:
            print("cannot read {}: {!r}".format(path, err))
            return None

    def _convert(self, content):
        """Parse one HTML file and write its chapter text to disk."""
        html = self._read_html(content)
        if html is None:
            return

        try:
            doc = pq(html)
            title = doc("#main .content_read .box_con .bookname h1").text()
            passage = doc("#content").text()
        except Exception as err:
            # pyquery/lxml raise a variety of exception types on bad markup.
            print("cannot parse {}: {!r}".format(content, err))
            return
        print("标题=====", title)

        # The output folder is named after the HTML file's parent directory.
        # Accept both separators so paths built with os.path.join work too.
        parts = content.replace("\\", "/").split("/")
        clipname = parts[-2] if len(parts) >= 2 else ""
        clipname = clipname.replace("《", "").replace("》", "")
        if not clipname:
            print("no parent folder in path {}".format(content))
            return

        print("正文======", passage.replace("<br/>", ""))

        # exist_ok avoids the exists()/mkdir() race between worker threads.
        os.makedirs(clipname, exist_ok=True)

        target = os.path.join(clipname, title + ".txt")
        try:
            with open(target, "w", encoding="gbk") as writer:
                writer.write(passage)
                print("完成{}的写入".format(target))
        except Exception:
            # Record the file that could not be written (bad characters in
            # the title, text not encodable as gbk, ...). errors="replace"
            # keeps the log write itself from failing on the same characters.
            with open("errorecorder.log", "a", errors="replace") as writer:
                writer.write(target + "\n")
        print("文件夹名称======", clipname)
def launchtxtparser(parentdir):
    """Queue every file found one level below *parentdir* and parse them.

    Each sub-directory of *parentdir* is listed, and the full path of every
    entry inside it is put on a work queue. Ten daemon ``txtparser`` threads
    are then started to consume the queue, and the call blocks on
    ``queue.join()``.

    Args:
        parentdir: Path of the directory whose sub-directories hold the
            saved HTML files.
    """
    queue = Queue()
    print(parentdir)
    for entry in os.listdir(parentdir):
        print(entry)
        subdir = os.path.join(parentdir, entry)
        if not os.path.isdir(subdir):
            continue
        print(subdir)
        # Iterate the listing directly: every file is queued, including the
        # first one, and an empty sub-directory is simply skipped.
        for filename in os.listdir(subdir):
            fullfilename = os.path.join(subdir, filename)
            queue.put(fullfilename)
            print(fullfilename)
        print("ooooophs~处理完毕")

    for _ in range(10):
        worker = txtparser(queue)
        # Daemon threads do not keep the process alive after the main thread exits.
        worker.daemon = True
        worker.start()
    queue.join()
#print(os.listdir(rootdir))
# Raw string: "\月" is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning; the resulting path is unchanged.
launchtxtparser(r"E:\月关")