笔记

python

注释

# 单行注释

''' ''' 多行注释
变量

没有声明变量的过程；个人理解：解释器会自动识别变量的类型
输入输出

print()和input()
数学运算

+ - * /
/	浮点除法
//  整除
**  幂运算
%   取模
> < >= <= != == ｛<>不等｝
not and or
赋值=

流程控制

if 条件表达式:

something

else:

something

if 条件表达式:

something

elif 条件表达式: #注意：Python 中只能写 elif，没有 else if

something

else:

something

n = 100

# Print 0 .. n-1.
for i in range(n):
    print(i)

# Count down from n to 1. Python has no `--` operator, so decrement with `-=`.
while n > 0:
    print(n)
    n -= 1

函数

def hello():
    """Print a greeting to standard output."""
    print("hello world")

主函数

def main():
    """Program entry point: run the greeting routine."""
    hello()


# Only run main() when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

递归

def fact(n):
    """Return the factorial of n, computed recursively; any n <= 1 yields 1."""
    # Base case on the left of the conditional, recursive step on the right.
    return 1 if n <= 1 else fact(n - 1) * n

序列：列表，元组，字符串，buffer对象，xrange对象，unicode字符串

映射：字典
```
```python
```

列表长度可变，内容可不同

a = [1, 2, 3]
a.append(4)   # add an element at the end
a.sort()      # sort in place
a.pop()       # remove and return the last element (4)
a.remove(2)   # remove the first element equal to 2 (the int; "2" is not in the list)
for i in a:
    print(i)
```

元组
```
# Ordered and immutable. A one-element tuple needs a trailing comma;
# ('data') without it is just the string 'data'.
a = ('data',)
```

字典

x = {"tree": "12346",
     123: "123"}
x.get("tree", "default")  # look up a key; returns "default" when the key is missing
len(x)                    # number of key/value pairs
x.pop("tree")             # remove a key and return its value

集合

a = set(['lucy','lily'])
支持 & | 运算

面向对象

class B:
    """Base class whose constructor announces itself."""

    def __init__(self):
        print("B构造方法")


class A(B):
    """Subclass of B showing class attributes, instance attributes and method kinds."""

    num = 0  # class attribute, shared by all instances

    def __init__(self, a):
        # Run the parent constructor before setting up this instance.
        # NOTE: the parameter `a` is accepted but not used.
        super().__init__()
        self.name = "hello"
        self.age = 20

    def output(self):
        """Print this instance's age."""
        print(self.age)

    @staticmethod
    def Doing():
        """Static method: callable on the class without an instance."""
        print("test")

    def __printf(self):
        """Private method: the leading double underscore triggers name mangling."""
        pass

异常处理

try:
	捕捉异常
except Exception as e:
	print(e)
else:
    无异常
finally:
    总要执行

文件处理

# Read the whole file at once; `with` closes the file even if an error occurs.
with open("文件路径", "r") as f:  # open() returns a file object
    print(f.read())

# Read the file as a list of lines.
with open("文件路径", "r") as f:
    x = f.readlines()  # list of lines; the optional argument is a size hint
    for line in x:
        print(line)

# Open for writing ("w" truncates an existing file).
with open("文件路径", "w") as f:
    f.write("只能写字符串")

自带 help

#安装三方库
#cmd 中 python -m pip install 库名
#help> modules 列出库

#随机模块
import random  # `import` takes a module name, not a call
print(random.randint(1, 10))  # random integer in [1, 10], both ends included

#时间模块
import time
sec = time.time()  #同c time()

#sys模块
import sys
sys.path
sys.modules
sys.exit(0)
sys.platform #返回平台标识
sys.argv

#os模块
import os
os.environ
os.system("cmd命令")
os.sep
os.pathsep
os.linesep

r"字符串" #原始字符串：保持字面意义，不处理"\"等转义

多线程

import threading
import time

def test(n):
    while n > 0:
        print(n);
        time.sleep(1)
        n -= 1

# Run test(5) on a separate thread; args must be a tuple, hence the trailing comma.
x = threading.Thread(target = test,args=(5,))
x.start()  # start() returns immediately; the countdown runs in the background

#法二：类的继承
class A(threading.Thread):
    """Thread subclass carrying a boolean stop flag."""

    def __init__(self):
        super().__init__()
        # The flag starts cleared; __stop() sets it.
        self.stop__ = False

    def __stop(self):
        """Set the stop flag (private, name-mangled method)."""
        self.stop__ = True

zip

#打开zip文件
import zipfile

# Open the archive read-only; `with` closes it once extraction is done.
with zipfile.ZipFile("test.zip", "r") as z_file:
    # pwd is the archive password and must be bytes, hence encode()
    z_file.extractall(pwd="cwl".encode())

压缩包暴力密码破解

知道原理后简单不写了...

网络方面

#服务器
import socket

host = socket.gethostname()               #取得本机主机名
host_ip = socket.gethostbyname(host)      #取得本机ip地址
port = 45535      #端口

server_addr = (host_ip,port)

server = socket.socket()      #取得套接字对象用于收发数据
server.bind(server_addr)      #绑定地址到套接字对象

server.listen(5)
print("等待用户连接")

while 1:

      client,addr = server.accept()
      print("用户已连接...")
      client.send("欢迎连接".encode())

正则表达式入门

"." 匹配任意字符，\n除外

"*" 匹配前一字符0次或无限次

"?" 匹配前一个字符0次或1次

".*" 贪心匹配，匹配尽可能多

".*?" 非贪心匹配，匹配尽可能少

() 括号内的内容作为结果返回

findall()	#匹配所有符合规则的内容，返回列表
search()	#匹配提取第一个符合的内容，返回正则对象
sub()		#替换符合规律的内容，返回替换后的值

例：

import re

# 1
a = 'xy123'
b = re.findall('x...', a)
print(b)  # ['xy12']
# "." is a placeholder: each dot matches exactly one character

# 2
a = 'xyxy123'
b = re.findall('x*', a)
print(b)  # ['x', '', 'x', '', '', '', '', '']
# "*" matches the preceding character zero or more times

# 3
a = 'xy123'
b = re.findall('x?', a)
print(b)  # ['x', '', '', '', '', '']
# "?" matches the preceding character zero or one time

#重点（.*）
code = jklasd [xx asjkdlasl xx asds xx asdasdasdasd xx]#贪心匹配更多
#重点（.*？）
code = jklasd [xx asjkdlasl xx] asds [xx asdasdasdasd xx]#少量多次

(.*?)#括号内返回的内容

#匹配函数第三参数,特判
re.findall('',s,re.S) #re.S 匹配扩大到换行符号

findall与search

s = 'asdfxxIxx123xxlovexxdfd'
# search() stops at the first match; group(2) is the second capture
f1 = re.search('xx(.*?)xx123xx(.*?)xx', s).group(2)
print(f1)  # love

# findall() returns every match, each as a tuple of its captures
f2 = re.findall('xx(.*?)xx123xx(.*?)xx', s)
print(f2[0][1])  # love

sub的使用 ,可用于翻页

s = '123rrrrr123'
# Replace the whole "123...123" match with a string built around a new number
output = re.sub('123(.*?)123', '123%d123' % 789, s)
print(output)  # 123789123

导入方式2

from re import findall,search,S
#这样可省略re.S,不推荐

匹配纯粹数字

a = 'asdfasf1234567fasd555fas'
# Raw string so that \d reaches the regex engine unchanged
b = re.findall(r'(\d+)', a)
print(b)  # ['1234567', '555']

python爬虫笔记

# -*- coding: utf-8 -*-
import urllib.request
import html
f = urllib.request.urlopen("http://59.77.139.92")
html = f.read()
html = html.decode("utf-8")#将二进制转换为utf-8
print(html)

1.爬图片

小爬虫替我弄了张百度的图片到桌面

# -*- coding: utf-8 -*-
import urllib.request


res = urllib.request.urlopen('https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/bd_logo1_31bdc765.png')
cat_img = res.read()

with open('baidu.png','wb') as f:
      f.write(cat_img)

一些其他的方法

response.geturl()	#得到访问地址
response.info()	    #HTTPmessage对象，可打印
response.getcode()  #得到http状态 200 == 正常

爬虫利用词典翻译

谷歌浏览器审查元素

找到Network的翻译消息，url和form data

不完全代码

import urllib.request
import urllib.parse

url = ''

data = {}

data = urllib.parse.urlencode(data).encode('utf-8')#unicode转utf-8

response = urllib.request.urlopen(url,data)
html = response.read().decode('utf-8')    #utf-8解码unicode

print(html)

爬虫隐藏

更改访问来源来隐藏,假装是用户访问

req = urllib.request.Request(url,data,head)#head是字典，如{'User-Agent':''}
或者先创建 req，再单独添加请求头：
req.add_header('User-Agent','')
最后用 response = urllib.request.urlopen(req) 发起请求

一些服务器以ip提交次数反爬虫，解决方案，1. 延时处理。 2. 代理

#延时访问
import time
time.sleep(5)

搜索获得代理ip

import urllib.request

url = 'http://www.whatismyip.com.tw' #一个获取本机ip的网站

#参数是一个字典{'类型':'代理ip:端口号'}
proxy_support = urllib.request.ProxyHandler({'http':'45.76.110.143:8080'})

#定制一个opener
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3226.400 QQBrowser/9.6.11682.400')]

#安装opener
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')

print(html)

#创建文件夹
import os
os.mkdir(folder) # create the directory
os.chdir(folder) # change into it (make it the current working directory)


#爬虫

#!/usr/bin/python

# -*- coding: UTF-8 -*-

import re
import requests

headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3226.400 QQBrowser/9.6.11682.400"}
html = requests.get("https://www.baidu.com/",headers = headers)

html = requests.get("https://www.baidu.com/")

html.encoding = 'utf-8'
print(html.text)

data = {}

html_post = requests.post(url,data=data)




```python
#coding=utf-8
import re
import urllib.request


def getHtml(url):
    """Download *url* and return the raw response body as bytes."""
    # `with` closes the connection once the body has been read.
    with urllib.request.urlopen(url) as page:
        return page.read()


def getImg(html):
    """Download every .jpg referenced in *html* and return the saved file names.

    *html* may be bytes (as returned by getHtml) or str. Images are saved in
    the current directory as 0.jpg, 1.jpg, ... in order of appearance.
    """
    if isinstance(html, bytes):
        # Decode so the str pattern below can be applied; undecodable bytes
        # are replaced rather than aborting the whole download.
        html = html.decode('utf-8', errors='replace')
    reg = r'src="(.+?\.jpg)" pic_ext'
    saved = []
    for x, imgurl in enumerate(re.findall(reg, html)):
        filename = '%s.jpg' % x
        urllib.request.urlretrieve(imgurl, filename)
        saved.append(filename)
    return saved


# Guarded so that importing this file does not trigger a network download.
if __name__ == '__main__':
    html = getHtml("http://tieba.baidu.com/p/2460150866")
    print(getImg(html))

posted @ 2017-08-05 13:19 Q1143316492 阅读(154) 评论(0) 编辑收藏举报

刷新页面返回顶部

Q1143316492

新博客 https://q1143316492.github.io/

笔记

python

列表长度可变，内容可不同

多线程

zip

网络方面

正则表达式入门

python爬虫笔记

1.爬图片

小爬虫替我弄了张百度的图片到桌面

爬虫利用词典翻译

!/usr/bin/python

-- coding: UTF-8 --

html = requests.get("https://www.baidu.com/")

data = {}

html_post = requests.post(url,data=data)

公告

Q1143316492

新博客 https://q1143316492.github.io/

笔记

python

列表 长度可变，内容可不同

多线程

zip

网络方面

正则表达式入门

python爬虫笔记

1.爬图片

小爬虫替我弄了张百度的图片到桌面

爬虫利用词典翻译

!/usr/bin/python

-- coding: UTF-8 --

html = requests.get("https://www.baidu.com/")

data = {}

html_post = requests.post(url,data=data)

公告

列表长度可变，内容可不同