Python — opening blog pages in a browser with Python

Automatically launch the default browser and open a specified set of web pages.

PS: A known limitation of this program: it only handles the first page of the blog list; pagination is left as an extension (a rough sketch of one approach follows the first script below).
# coding:utf-8

import webbrowser as web
import os
import time
import random
import urllib2
from bs4 import BeautifulSoup


def getPage():  # pose as a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    urlList = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return urlList  # nothing fetched, bail out
    # hand the page source to BeautifulSoup for parsing
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')  # one item per article
    for item in items:
        content = item.find('a')
        url = content.get('href')  # relative link of this article
        url = baseurl + url  # join into a full, reachable address
        urlList.append(url)
    return urlList
urls = getPage()  # the list of article URLs on the blog


i = 0
while i < len(urls):  # one iteration per article URL, i.e. one tab per post
    web.open_new_tab(urls[i])  # open the page in the default browser
    i = i + 1
    time.sleep(1)  # pause 1s after each open so the machine does not choke
else:
    time.sleep(2)
    # after everything has been opened once, close the browser so the
    # processes do not pile up and freeze the machine
    os.system('taskkill /f /IM Chrome.exe')
    print u"browser closed"
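As for the pagination gap flagged at the top, one way to extend getPage() is to walk the list pages until an empty page comes back. A rough sketch in the same Python 2 style; the &page=N query parameter is an assumption about CSDN's list URLs, not something verified here:

def getAllPages():  # sketch only: adjust the paging parameter to the real site
    headers = {'User-Agent': 'Mozilla/5.0'}
    urlList = []
    pageNum = 1
    while True:
        url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents&page=%d' % pageNum
        req = urllib2.Request(url=url, headers=headers)
        page = BeautifulSoup(urllib2.urlopen(req).read(), 'lxml')
        items = page.find_all('div', class_='list_item list_view')
        if not items:  # an empty page means we ran past the last one
            break
        for item in items:
            urlList.append('http://blog.csdn.net' + item.find('a').get('href'))
        pageNum += 1
    return urlList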

Now modify the code to implement the following:

count the blog posts under the account and list them sorted by view count, from lowest to highest.

#coding:utf-8
import urllib2,re,time,random,os
from bs4 import BeautifulSoup
import webbrowser as web
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # so unicode article titles can be written out as utf-8

def getPage():  # pose as a browser and fetch the page source
    url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    baseurl = 'http://blog.csdn.net'
    contentList = []
    sortlist = []   # one dict per article
    sortlist1 = []  # the bare view counts, for sorting
    urlList = []
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError, e:
        print e.code
        print e.reason
        return urlList  # nothing fetched, bail out
    fd = open('counter.txt', 'w')
    page = BeautifulSoup(html, 'lxml')
    items = page.find_all('div', class_='list_item list_view')
    print u'%d articles in total' % len(items)
    for item in items:
        aa = {}
        content = item.find('a')
        contemtUrl = baseurl + content.get('href')  # full article URL

        read_time = item.find('span', class_='link_view')
        readtime = str(read_time.text.strip())  # e.g. "(1234)"

        # keep only the digits of the view counter
        readtimeNumber = int(filter(str.isdigit, readtime))
        sortlist1.append(readtimeNumber)
        aa['indexs'] = readtimeNumber
        aa['content'] = content.text.strip()
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print sortlist1

    # walk the sorted counts and emit the matching article records
    for i in sortlist1:
        for a in sortlist:
            if int(i) == int(a['indexs']):
                totalcontent = a['content']+'\t'+a['read_time']+'\t'+a['contemtUrl']
                print totalcontent
                fd.write(totalcontent)
                fd.write('\n')
                urlList.append(a['contemtUrl'])
                contentList.append(totalcontent)
    fd.close()
    return urlList
urls = getPage()  # article URLs, ascending by view count
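Note that the nested matching loop prints every record whose count equals the current value, so two posts with identical view counts would each be written twice. A simpler, duplicate-safe way to get the same ascending order from the sortlist built above:

sortlist.sort(key=lambda a: a['indexs'])  # ascending by view count
for a in sortlist:
    print a['content'] + '\t' + a['read_time'] + '\t' + a['contemtUrl']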


The following is a small extra feature:

count = random.randint(3, 10)  # repeat a random number of rounds
j = 0
while j < count:
    for i in range(5):
        web.open_new_tab(urls[i])  # open the five least-viewed posts
        time.sleep(1)
    os.system('taskkill /f /IM Chrome.exe')  # close the browser between rounds
    j = j + 1
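taskkill is Windows-only. If this ever needs to run elsewhere, the cleanup step could branch on the platform; a minimal sketch, assuming Chrome is the browser being closed:

import platform, os

if platform.system() == 'Windows':
    os.system('taskkill /f /IM chrome.exe')
else:
    os.system('pkill -f chrome')  # Linux/macOS equivalent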


And here is a Python 3 version:

#coding:utf-8
import urllib,re,time,random,os
import urllib.request as urllib2
from bs4 import BeautifulSoup
import webbrowser as web
import sys
from functools import cmp_to_key

def getPage():  # pose as a browser and fetch the page source
    url = 'https://blog.csdn.net/qiqiyingse?viewmode=contents'
    contentList = []
    sortlist = []   # one dict per article
    sortlist1 = []  # the bare view counts, for sorting
    urlList = []
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        print(e.code)
        print(e.reason)
        return urlList  # nothing fetched, bail out
    fd = open('counter.txt', 'w')
    page = BeautifulSoup(html, 'lxml')
    # CSDN's newer layout uses different class names than the old one
    items = page.find_all('div', class_='article-item-box csdn-tracking-statistics')
    print('%d articles in total' % len(items))

    for item in items:
        aa = {}
        content = item.find('h4').find('a')
        contemtUrl = content.get('href')  # the new layout already returns an absolute URL
        content = content.text.strip()

        read_time_content = item.find('div', class_='info-box d-flex align-content-center')
        read_time = read_time_content.find('span', class_='read-num')
        readtime = str(read_time.text.strip())

        readtimeNumber = int(re.sub(r'\D', '', readtime))  # keep only the digits

        sortlist1.append(readtimeNumber)
        aa['indexs'] = readtimeNumber
        aa['content'] = content
        aa['read_time'] = readtime
        aa['contemtUrl'] = contemtUrl
        sortlist.append(aa)
    sortlist1.sort()
    print(sortlist1)
    # sort the article records themselves, ascending by view count
    sortlist = sorted(sortlist, key=cmp_to_key(lambda x, y: x['indexs'] - y['indexs']))

    for i in sortlist:
        totalcontent = i['content']+'\t'+i['read_time']+'\t'+i['contemtUrl']
        fd.write(totalcontent)
        fd.write('\n')
        urlList.append(i['contemtUrl'])
        contentList.append(totalcontent)

    fd.close()
    return urlList


count = random.randint(1, 3)
j = 0
while j < count:
    for i in range(5):
        web.open_new_tab(urls[i])
        time.sleep(1)
    #os.system('taskkill /f /IM Chrome.exe')
    os.system('taskkill /f /IM SogouExplorer.exe')  # close the Sogou browser between rounds
    j = j + 1
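For what it's worth, the fetch step could also be written with the third-party requests library instead of urllib.request; a minimal sketch, assuming requests and lxml are installed:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('https://blog.csdn.net/qiqiyingse?viewmode=contents',
                    headers=headers, timeout=10)
resp.raise_for_status()  # surface HTTP errors instead of printing codes by hand
page = BeautifulSoup(resp.text, 'lxml')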
