【原创】python 豆瓣采集
我是今天刚开始学 Python 的新手,代码有点凌乱,勉强看看吧,只能算是给新手做个参考,见谅。
这是一个简单版本的豆瓣美图采集脚本~ 美女天天有,有木有~
运行环境:
python 3.4
sqlite3
BeautifulSoup 4.4
from bs4 import BeautifulSoup
import urllib.request
import time
import sched
import os
import sqlite3
import sys


# SQLite database that remembers which topic URLs have already been scraped.
cx = sqlite3.connect('c:\\sqlite\\test.db')
cu = cx.cursor()

# Create the bookkeeping table on first run. Asking sqlite_master for the
# table by name also covers a brand-new (empty) database, where iterating
# over the existing tables would find nothing and never create it.
cu.execute(
    "select name from sqlite_master where type='table' and name=?",
    ('caiji',),
)
if cu.fetchone() is None:
    print("表不存在,开始创建")
    cu.execute("create table caiji (id INTEGER PRIMARY KEY AUTOINCREMENT,pid integer,nickname text NULL); ")
    cx.commit()
else:
    print("存在")

# Root directory for downloaded images; one sub-directory is created per day.
path = "d:\\imgs\\"
# strftime pattern used to name the per-day sub-directory (e.g. 20150131).
ISOTIMEFORMAT = '%Y%m%d'


def dwonloadimg(uri):
    """Download the image at *uri* into today's sub-directory of ``path``.

    The file name is the part of *uri* after its last ``/``. The per-day
    directory is created if it does not exist yet.

    Raises:
        urllib.error.URLError: if the image cannot be fetched.
        OSError: if the directory or the file cannot be written.
    """
    day_dir = path + time.strftime(ISOTIMEFORMAT, time.localtime())
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(day_dir, exist_ok=True)
    name = uri[uri.rfind("/") + 1:]
    # Context managers close the connection and the file even on errors.
    with urllib.request.urlopen(uri) as conn:
        data = conn.read()
    with open(os.path.join(day_dir, name), 'wb') as f:
        f.write(data)


def Getarticle1(uri):
    """Fetch the topic page at *uri* and download every picture in it.

    Pictures are the ``<img>`` elements found inside
    ``<div class="topic-figure cc">`` containers; containers without an
    image (or without a ``src``) are skipped.

    Raises:
        urllib.error.URLError: if the page or one of its images cannot be
            fetched.
    """
    with urllib.request.urlopen(uri) as res:
        html = res.read()
    # The raw bytes are handed to BeautifulSoup, which detects the encoding
    # itself; the parser is named explicitly so results do not depend on
    # which optional parsers happen to be installed.
    bs = BeautifulSoup(html, 'html.parser')
    for figure in bs.find_all('div', class_="topic-figure cc"):
        img = figure.find('img')
        if img is None or not img.get('src'):
            continue
        strc = img['src']
        print('图片:', strc)
        dwonloadimg(strc)


def init():
    """Run one scraping pass over the group's front page.

    Every topic link found in a ``<td class="title">`` cell is looked up in
    the ``caiji`` table. Links not seen before are recorded there and then
    passed to ``Getarticle1`` to download their pictures.

    Raises:
        urllib.error.URLError: if the listing page, a topic page or an image
            cannot be fetched.
        sqlite3.Error: if the database cannot be read or written.
    """
    print('开始抓取')
    url = "http://www.douban.com/group/haixiuzu/"
    with urllib.request.urlopen(url) as temp:
        html = temp.read()
    bs = BeautifulSoup(html, 'html.parser')
    for cell in bs.find_all('td', class_='title'):
        link = cell.a
        if link is None or not link.get("href"):
            continue
        uri = link["href"]
        # Parameterized query: the href comes from a remote page and must
        # never be spliced into the SQL text.
        cu.execute('select 1 from caiji where nickname=?', (uri,))
        if cu.fetchone() is None:
            print("新文章")
            # Recorded before downloading, so a topic is attempted only once.
            cx.execute("insert into caiji(nickname) values(?)", (uri,))
            cx.commit()
            Getarticle1(uri)
    print("结束")


if __name__ == "__main__":
    # Poll the group page once a minute, forever.
    while True:
        init()
        time.sleep(60)