【原创】python 豆瓣采集

新手今天刚学 python,代码有点凌乱,勉强看吧,只能算是给新手看看,见谅。

简单版本的 豆瓣采集美图~~~~~~ 美女天天有 有木有~~~

python 3.4

sqlite3

BeautifulSoup 4.4 

 1 from bs4 import BeautifulSoup
 2 import urllib.request
 3 import time,sched,os
 4 import sqlite3
 5 import sys
 6 
 7 
 8 
 9 #sys.exit()
10 
# Open the SQLite database that records which topic URLs were already
# scraped. `cx` (connection) and `cu` (cursor) are module-level names used
# by init() further down.
cx = sqlite3.connect('c:\\sqlite\\test.db')
cu = cx.cursor()

# Ask sqlite_master directly whether the `caiji` table exists. Looping over
# every table name and creating `caiji` for each non-matching one never
# creates it in an empty database (the loop body does not run) and issues a
# failing second CREATE when another table sorts before `caiji`.
cu.execute("select name from sqlite_master where type='table' and name=?", ('caiji',))
if cu.fetchone() is None:
    print("表不存在,开始创建")
    cu.execute("create table caiji (id INTEGER PRIMARY KEY AUTOINCREMENT,pid integer,nickname text NULL); ")
    cx.commit()
else:
    print("存在")
23 #t=('grmlmgjsadf',)
24 #cx.execute("insert into caiji(nickname) values(?)",t)
25 #cx.commit()
26 #cu.execute('select * from caiji where nickname=\''+'grmlmgjsadf'+'\'')
27 #if cu.fetchall():
28 #    print('dsa')
29 
30 
31 #cu.close()
32 #cx.close()
33 
# Root directory under which downloaded images are stored.
path = "d:\\imgs\\"
# strftime pattern (year, month, day) used to name the per-day sub-directory.
ISOTIMEFORMAT = '%Y%m%d'
36 
37 
def dwonloadimg(uri):
    """Download the resource at *uri* into today's dated folder under ``path``.

    The target directory is ``path`` joined with the current local date
    formatted by ``ISOTIMEFORMAT``; it is created when missing. The file
    name is the part of *uri* after its last ``/``.

    Errors from ``urllib.request.urlopen`` and from the file system
    propagate to the caller.
    """
    day_dir = os.path.join(path, time.strftime(ISOTIMEFORMAT, time.localtime()))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(day_dir, exist_ok=True)

    name = uri[uri.rfind("/") + 1:]

    # Context managers close the response and the file even when read() or
    # write() raises. Reading first means a failed download leaves no empty
    # file behind.
    with urllib.request.urlopen(uri) as conn:
        data = conn.read()
    with open(os.path.join(day_dir, name), 'wb') as f:
        f.write(data)
50     
51 
def Getarticle1(uri):
    """Fetch the topic page at *uri* and download each image it shows.

    Looks at every ``<div class="topic-figure cc">`` on the page, prints the
    ``src`` of the first ``<img>`` inside it and hands that URL to
    ``dwonloadimg``. Figure blocks without an ``<img src=...>`` are skipped.
    """
    with urllib.request.urlopen(uri) as res:
        html = res.read()

    # The raw bytes go straight to BeautifulSoup, which detects the encoding
    # itself. The former `str(html, 'utf-8')` discarded its result and could
    # only raise on malformed bytes. Naming the parser keeps parsing
    # independent of which optional parsers are installed.
    bs = BeautifulSoup(html, 'html.parser')

    for figure in bs.find_all('div', class_="topic-figure cc"):
        img = figure.find('img')
        if img is None or not img.get('src'):
            # No usable image in this block; previously an AttributeError/KeyError.
            continue
        strc = img['src']
        print('图片:', strc)
        dwonloadimg(strc)
def init():
    """Scan the group's topic list once and scrape topics not seen before.

    Every ``<td class="title">`` link on the group page is looked up in the
    ``caiji`` table (column ``nickname`` holds the topic URL). Unknown URLs
    are inserted, committed, and then passed to ``Getarticle1``.

    Uses the module-level connection ``cx`` and cursor ``cu``.
    """
    print('开始抓取')
    url = "http://www.douban.com/group/haixiuzu/"
    with urllib.request.urlopen(url) as temp:
        html = temp.read()
    bs = BeautifulSoup(html, 'html.parser')

    for cell in bs.find_all('td', class_='title'):
        link = cell.a
        if link is None or not link.get("href"):
            # Title cell without a link: nothing to look up or fetch.
            continue
        uri = link["href"]

        # No `global cu` here: the name is only read, and declaring it after
        # first use is a SyntaxError on Python 3.6+. The scraped URL is bound
        # as a parameter rather than spliced into the SQL text.
        cu.execute('select 1 from caiji where nickname=?', (uri,))
        if cu.fetchone() is None:
            print("新文章")
            # NOTE: the row is committed before the download starts, so a
            # topic whose download fails is not retried on later scans.
            cx.execute("insert into caiji(nickname) values(?)", (uri,))
            cx.commit()
            Getarticle1(uri)
    print("结束")
84 
85 
# Poll forever: run one scan of the group, then pause before the next one.
while True:
    init()
    time.sleep(60)  # one minute between scans

 

posted @ 2015-09-06 20:00  Red Cat  阅读(951)  评论(0)  编辑  收藏  举报

Copyright © 2022 LyShark Powered by .NET 6 on Kubernetes
Theme - LyTheme 1.0