My first simple crawler written in Python, recorded here on my blog.
# python.py
import urllib.request
from bs4 import BeautifulSoup
from MySqlite import MySqlite

g_intid = 0

def GetBlogTileAndName(url):
    res = urllib.request.urlopen(url)
    html = res.read()
    res.close()
    # decode the bytes (the original called str() but discarded the result)
    html = str(html, 'utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    # every post title sits in a div with class "postTitle"
    divs = soup.find_all(attrs={"class": "postTitle"})
    global g_intid
    for divname in divs:
        print("title:=", divname.a.string, "href:=", divname.a["href"])
        g_intid += 1
        x = MySqlite()
        x.InsertDate(g_intid, divname.a["href"], divname.a.string)

def GetBlogPage(url):
    res = urllib.request.urlopen(url)
    html = res.read()
    res.close()
    html = str(html, 'utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    # the pager div holds the page-navigation text
    divPager = soup.find(attrs={"class": "pager"})
    print(divPager.string)
# crawl the first 7 pages of the post list
for i in range(1, 8):
    url = r"http://www.cnblogs.com/FCoding/default.html?page=" + str(i)
    GetBlogTileAndName(url)
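GetBlogPage is defined above but never used; it only prints the pager div. A possible next step would be to derive the page count from that pager instead of hardcoding range(1, 8). The sketch below does that; the name GetPageCount and the digit-scraping regex are my own assumptions about the pager markup, untested against the live site:

# sketch: compute the page count from the pager div instead of hardcoding it
import re
import urllib.request
from bs4 import BeautifulSoup

def GetPageCount(url):
    res = urllib.request.urlopen(url)
    html = str(res.read(), 'utf-8')
    res.close()
    soup = BeautifulSoup(html, 'html.parser')
    divPager = soup.find(attrs={"class": "pager"})
    if divPager is None:
        return 1  # no pager div means the blog fits on one page
    # assume the pager text carries the page numbers as plain digits
    numbers = [int(n) for n in re.findall(r"\d+", divPager.get_text())]
    return max(numbers) if numbers else 1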
# MySqlite.py
import sqlite3

class MySqlite(object):
    """Tiny helper that stores blog entries in a local SQLite database."""

    def callstr(self, str):
        print(str)

    def InsertDate(self, id, url, title):
        conn = sqlite3.connect(r"d:\123.db")
        c = conn.cursor()
        # first run only: create the table
        # c.execute('create table blog (ID integer, url text, title text, PRIMARY KEY(ID))')
        strExe = 'insert into blog values ({0}, "{1}", "{2}")'.format(id, url, title)
        print(id)
        c.execute(strExe)
        conn.commit()
        c.close()
        conn.close()

    def GetDate(self):
        conn = sqlite3.connect(r"d:\123.db")
        c = conn.cursor()
        c.execute("select count(*) from blog")
        res = c.fetchone()
        print(res[0])
        data = c.execute("select * from blog")
        for item in data:
            for ite in item:
                print(ite)
        c.close()
        conn.close()
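One caveat about InsertDate: building the SQL string with format() breaks as soon as a title contains a double quote, and it is the classic SQL-injection pattern. A minimal sketch of the same insert using sqlite3's parameterized queries (the name InsertDateSafe is my own; it would be a drop-in replacement inside MySqlite):

# sketch: a safer variant of InsertDate using "?" placeholders
import sqlite3

def InsertDateSafe(self, id, url, title):
    conn = sqlite3.connect(r"d:\123.db")
    c = conn.cursor()
    # sqlite3 escapes url and title itself, so a quote in a post
    # title no longer breaks the statement
    c.execute("insert into blog values (?, ?, ?)", (id, url, title))
    conn.commit()
    c.close()
    conn.close()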
A brief description of what it does:
Download each page with urllib and parse it with BeautifulSoup.
Call find_all(attrs={"class":"postTitle"})
to find every tag in the HTML whose class is "postTitle",
then iterate over the results, pull out the title and href, and save both to the database (a small self-contained illustration of this extraction step follows below).
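For readers new to BeautifulSoup, here is that extraction step in isolation; the HTML snippet is made up to mirror the cnblogs post-title markup:

# a minimal, self-contained illustration of the extraction step
from bs4 import BeautifulSoup

html = '<div class="postTitle"><a href="/FCoding/p/1.html">Hello</a></div>'
soup = BeautifulSoup(html, 'html.parser')
for div in soup.find_all(attrs={"class": "postTitle"}):
    print(div.a.string, div.a["href"])   # -> Hello /FCoding/p/1.html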
This program has no error handling at all (a minimal sketch of adding some follows below). I'm a beginner, so please don't laugh!
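If I were to add the most basic error handling, it would go around the download step, so that one bad page does not abort the whole crawl. A minimal sketch, untested:

# sketch: wrap each page fetch so a network failure only skips that page
import urllib.error

for i in range(1, 8):
    url = r"http://www.cnblogs.com/FCoding/default.html?page=" + str(i)
    try:
        GetBlogTileAndName(url)
    except urllib.error.URLError as e:
        print("skip page", i, ":", e)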