先记录一下 idea安装Python的语言支持插件后的操作:我用的是windows环境、windows环境、windows环境。
首先 下载 Anaconda3 的可执行文件 下载地址
安装完成后,打开 cmd 输入:conda info --envs 查看下环境,默认只有一个 base;
下面那个py37是后来建的,创建命令conda create -n py37
删除环境(谨慎执行):conda remove -n py37 --all
激活环境:activate base
关闭环境:deactivate base
#!/usr/bin/python3 print('hello world')
报错:TypeError: cannot use a string pattern on a bytes-like object. 原因:read() 返回的是 bytes,而正则表达式的 pattern 是 str,需要先把 bytes 解码成 str:
# Decode the raw response (presumably the bytes returned by read()) into str
# so that str-based regex patterns can be applied to it.
html = html.decode( 'utf-8' )
如果用 urllib.request.urlopen 方式打开一个URL,服务器端只会收到一个单纯的对于该页面访问的请求,但是服务器并不知道发送这个请求使用的浏览器,操作系统,硬件平台等信息,而缺失这些信息的请求往往都是非正常的访问,例如爬虫.
def getHtml(url):
    """Fetch *url* with browser-like request headers and return its text.

    A bare ``urlopen(url)`` call sends no browser information, which many
    servers treat as crawler traffic; the headers below make the request
    look like it comes from a desktop browser.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8; bytes that are not valid UTF-8
        are dropped instead of raising ``UnicodeDecodeError``.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # Imported locally so the snippet is runnable on its own.
    import urllib.request

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/',
    }
    # Request + urlopen replaces the deprecated urllib.request.URLopener,
    # which is no longer available in recent Python releases.
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(request) as page:
        html = page.read()
    return html.decode('utf-8', "ignore")
4.用python3版本,引入request的下载函数 urlretrieve时候,可以这么做,举一反三,可以减少代码量
from urllib.request import urlretrieve
'utf-8' codec can't decode byte 0xd7 in position 309: invalid continuation byte
6.int 型变量 index 转 str类型---->str(index)
相反:str 类型变量 string 转 int 类型 -----> int(string) --------10进制下
def Schedule(a, b, c):
    """Print download progress as a percentage with two decimals.

    Presumably passed as the ``reporthook`` of ``urlretrieve`` (verify
    against the caller), in which case the arguments are the number of
    blocks transferred, the block size and the total size.

    Args:
        a: Number of blocks transferred so far.
        b: Size of one block.
        c: Total size; a value <= 0 means the size is unknown.
    """
    # An unknown or zero total would give a negative percentage or a
    # ZeroDivisionError, so there is nothing meaningful to report.
    if c <= 0:
        return
    # The last block usually overshoots the total, so cap at 100 %.
    per = min(100.0 * a * b / c, 100.0)
    print('%.2f%%' % per)
# coding:utf-8
# Downloads the 210px-wide images found on one xiaohuar.com list page into a
# local folder.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Name of the folder (created under the current working directory) that
# receives the downloaded images.
FileName = 'mm'


def dd():
    """Create the target folder if needed and save every matching image.

    Images are written as ``1a.jpg``, ``2a.jpg``, ... in page order.
    """
    folder = os.path.join(os.getcwd(), FileName)
    if not os.path.exists(folder):
        # Create the folder on first run.
        os.mkdir(folder)
        print(u'建了一个名字叫做', FileName, u'的文件夹!')
    else:
        print(u'名字叫做', FileName, u'的文件夹已经存在了!')

    url = 'http://www.xiaohuar.com/list-1-1.html'
    html = requests.get(url).content  # raw bytes; BeautifulSoup detects the encoding
    soup = BeautifulSoup(html, 'html.parser')
    jpg_data = soup.find_all('img', width="210")  # the thumbnails of interest

    for index, img in enumerate(jpg_data, start=1):
        data = img['src']  # image URL as written in the page
        print("图片url为" + data)
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page URL, instead of blindly prefixing the site root.
        data = urljoin(url, data)
        r2 = requests.get(data)
        fpath = os.path.join(folder, str(index) + "a")
        with open(fpath + '.jpg', 'wb+') as f:
            f.write(r2.content)
    print('保存成功,快去查看图片吧!!')


if __name__ == '__main__':
    dd()
我要判断获取到的<a>标签内是否含有 'shtml'
result = string.find(a_data,shtml)!=-1
if result:
报错:module 'string' has no attribute 'find'。Python 3 的 string 模块已经没有 find 函数,应改用字符串自身的方法或 in 运算符,例如:result = 'shtml' in str(a_data)
3)can only concatenate str (not "int") to str
4) 以下是其中一个生成的文本截图,正文有了,但是调取接口却传不过去数据,导致校验返回为:缺少必填字段,
5) 在陆续解决一些小问题后;终于出点成绩
可惜的是在运行到第115的时候, 程序报错:
urllib3.exceptions.MaxRetryError: HTTPConnectionPool
6) urllib3.exceptions.MaxRetryError: HTTPConnectionPool
# coding:utf-8
# checkshtml: collects the link targets of every <a> tag on a page.
import os
import string
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import checkwebsite

# Folder settings; FileName is created under the current working directory.
FilePath = 'I:\\test\\'
FileName = 'xhtml'


def dd(url):
    """Return the absolute URLs of all links found on the page at *url*.

    Also makes sure the ``FileName`` folder exists under the current
    working directory.

    Args:
        url: Address of the page to scan.

    Returns:
        A list with one absolute URL per ``<a>`` tag that has an ``href``
        attribute, in document order.
    """
    folder = os.path.join(os.getcwd(), FileName)
    if not os.path.exists(folder):
        # Create the folder on first run.
        os.mkdir(folder)
        print(u'建了一个名字叫做', FileName, u'的文件夹!')
    else:
        print(u'名字叫做', FileName, u'的文件夹已经存在了!')

    html = requests.get(url).content  # raw bytes; BeautifulSoup detects the encoding
    soup = BeautifulSoup(html, 'html.parser')

    datalist = []
    for a_data in soup.find_all('a'):
        href = a_data.get('href')
        if href is None:
            # Anchor without a target: nothing to collect.
            continue
        # Resolve relative links against the page URL; absolute links
        # (http, https, ...) are returned unchanged by urljoin.
        datalist.append(urljoin(url, href))
    return datalist
# coding:utf-8
# checkwebsite: extracts the title and body text of an article page and
# appends text to a local file.
import codecs
import os
import urllib

import requests
from bs4 import BeautifulSoup
from html5lib import HTMLParser


def checkTitle(soup):
    """Return the text of the first ``<ucaptitle>`` element, or ''.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The title text, or an empty string when the page has no
        ``<ucaptitle>`` element.
    """
    title_tag = soup.find('ucaptitle')
    if title_tag is None:
        return ''
    # get_text() also works when the tag carries attributes or nested
    # markup, where slicing str(tag) by hand fails.
    return title_tag.get_text()


def checkContent(soup):
    """Return the article body, one line per ``<p>`` paragraph.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The text of every ``<p>`` inside every ``<ucapcontent>`` element,
        each followed by a newline; '' when there is no such paragraph.
    """
    lines = []
    for container in soup.find_all('ucapcontent'):
        for paragraph in container.find_all('p'):
            # get_text() keeps the text of paragraphs that contain inline
            # tags (<span>, <strong>, ...) instead of dropping it.
            lines.append(paragraph.get_text() + '\n')
    return ''.join(lines)


def writeTxt(result):
    """Append ``str(result)`` to ``I:\\data.txt`` encoded as UTF-8.

    Args:
        result: Value to append; it is converted with ``str()``.
    """
    # Mode 'a' appends; 'w' would overwrite whatever the file already holds.
    # The with-block closes the file even if the write fails.
    with codecs.open('I:\\data.txt', 'a', 'utf-8') as f:
        f.write(str(result))


if __name__ == '__main__':
    sHtmlUrl = "http://www.hainan.gov.cn/hainan/hngs/201906/c0a08f5b5a7e42b2bab66212de76b050.shtml"
    html = requests.get(sHtmlUrl).content
    soup = BeautifulSoup(html, 'html.parser')
    # writeTxt(checkTitle(soup))
    writeTxt(checkContent(soup))
    # This script crawls the target article into a local txt file.
    # A follow-up script reads that file, calls the remote POST service and
    # appends the returned summary, so only the txt file needs analysing.
# readFIleAndPost: sends the text of a local file to the parsing service and
# stores the response.
import codecs
import json
import time

import requests

import checkwebsite

fileUrl = 'I:\\data.txt'


def two(fileUrl):
    """POST the content of *fileUrl* to the parser service and return the reply.

    The file is read as UTF-8, trailing whitespace is stripped from every
    line and the lines are concatenated without separators. If the first
    request fails it is retried once after two seconds.

    Args:
        fileUrl: Path of the text file to send.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: If the retry fails as well.
    """
    # The with-block closes the file once its content has been read.
    with codecs.open(fileUrl, 'r', 'utf-8') as f:
        content = ''.join(line.rstrip() for line in f)

    # POST request
    url = 'http://localhost:8080/documentNew/parserNew'
    s = json.dumps({'content': content, 'keywordCount': '5', 'summarySize': '2'})
    try:
        r = requests.post(url, s)
    except requests.RequestException:
        # Only network/HTTP failures trigger the retry; other errors
        # (and Ctrl+C) propagate.
        print(u'[%s] HTTP请求失败!!!正在准备重发。。。' % url)
        time.sleep(2)
        r = requests.post(url, s)
    resultStr = str(r.text)
    r.close()
    return resultStr


if __name__ == '__main__':
    checkwebsite.writeTxt(two(fileUrl))
# Driver script: downloads articles into two parallel folders, then appends
# the parser-service response to the working copies.
import codecs
import os
import shutil

import requests
from bs4 import BeautifulSoup

import checkshtml
import checkwebsite
import readFIleAndPost

url = 'http://www.hainan.gov.cn/hainan/'
# Source copies (left untouched) and working copies (response appended).
filePath = 'I:\\test\\shtml\\'
updateFilePath = 'I:\\test\\xhtml\\'


def downLoadFile(url):
    """Save title and body of every ``shtml`` article linked from *url*.

    Each article with a non-empty body is written twice, as
    ``shtml<N>.txt`` under ``filePath`` and under ``updateFilePath``.

    Args:
        url: Page whose links are scanned for articles.
    """
    # All link targets of the page, filtered down to shtml documents.
    resultlist = checkshtml.dd(url)
    shtmllist = [link for link in resultlist if "shtml" in link]

    index = 1
    for shtml in shtmllist:
        html = requests.get(shtml).content
        soup = BeautifulSoup(html, 'html.parser')
        strtitle = checkwebsite.checkTitle(soup).strip()
        strcontent = checkwebsite.checkContent(soup)
        if strcontent == '':
            # Not an article page: skip it without consuming a file number.
            continue

        urlpath = filePath + 'shtml' + str(index) + '.txt'
        updateurl = updateFilePath + 'shtml' + str(index) + '.txt'
        # Both files are closed even if one of the writes fails.
        with codecs.open(urlpath, 'w', 'utf-8') as f1, \
                codecs.open(updateurl, 'w', 'utf-8') as f2:
            f1.write(strtitle + '\n')
            f2.write(strtitle + '\n')
            print("源文件路径>>>>>>" + urlpath + ">>>>>>标题:" + strtitle)
            print("修改文件路径>>>>>>" + updateurl + ">>>>>>标题:" + strtitle)
            f1.write(strcontent)
            f2.write(strcontent)
        index += 1


def updateFile():
    """Append the parser-service response to every file in ``updateFilePath``."""
    for name in os.listdir(updateFilePath):
        # Response of the service for the current file content.
        response = readFIleAndPost.two(updateFilePath + name)
        with codecs.open(updateFilePath + name, 'a', 'utf-8') as f:
            f.write(response)


def deleteAndCopyFile():
    """Reset ``updateFilePath`` to fresh copies of the files in ``filePath``."""
    del_file(updateFilePath)
    for fileNAME in os.listdir(filePath):
        shutil.copyfile(filePath + fileNAME, updateFilePath + fileNAME)


def del_file(path):
    """Remove every file below *path*, recursively; directories are kept.

    Args:
        path: Directory to empty.
    """
    for entry in os.listdir(path):
        path_file = os.path.join(path, entry)
        if os.path.isfile(path_file):
            os.remove(path_file)
        else:
            del_file(path_file)


if __name__ == '__main__':
    # Step 1 - download both copies; run once, then keep it commented out:
    # downLoadFile(url)
    # Step 2 - alternatively, update the existing working copies directly:
    # updateFile()
    # Default: discard the old working copies, copy the sources again
    # (filePath is the source folder), then send the POST requests.
    deleteAndCopyFile()
    updateFile()
[{'name': '张三', 'phone': '185185', 'wechat': '6546231'}, {'name': '李四', 'phone': '187169', 'wechat': 'asdsad'}]
for studeht in self .studehts :
    # Wrong: .get is a method, so subscripting it with [] raises TypeError
    # (assuming each studeht is a dict like the sample data; it would have
    # to be called as .get("name")).
    name = studeht .get[ "name" ]
    # Correct: index the dict directly by key.
    name = studeht[ "name" ]
