python的一些常用操作

  • 判断文件或者文件夹是否存在
if(os.path.exists(rootdir) == False)
  • 创建文件夹
os.mkdir(rootdir)
  • 调用系统命令
os.system(cmd)
  • 字典循环
for key,value in dict.items()
  • 打开文件并读取内容进行处理
fd = open('xxxx.txt', encoding='utf-8')
for line in fd:
     print line
fd.close()
  • 创建文件并写入内容
fd = open('xxxx.txt', 'a+', encoding='utf-8')
fd.write('aaaaa' + '\n')
fd.close()
  • 使用xlrd读取EXCEL
导入
import xlrd
打开excel
data = xlrd.open_workbook('demo.xls') #注意这里的workbook首字母是小写
查看文件中包含sheet的名称
data.sheet_names()
得到第一个工作表,或者通过索引顺序 或 工作表名称
table = data.sheets()[0]
table = data.sheet_by_index(0)
table = data.sheet_by_name(u'Sheet1')
获取行数和列数
nrows = table.nrows
ncols = table.ncols
获取整行和整列的值(数组)
table.row_values(i)
table.col_values(i)
循环行,得到索引的列表
for rownum in range(table.nrows):
print table.row_values(rownum)
单元格
cell_A1 = table.cell(0,0).value
cell_C4 = table.cell(2,3).value
分别使用行列索引
cell_A1 = table.row(0)[0].value
cell_A2 = table.col(1)[0].value
简单的写入
row = 0
col = 0
ctype = 1 # 类型 0 empty,1 string, 2 number, 3 date, 4 boolean, 5 error
value = 'lixiaoluo'
xf = 0 # 扩展的格式化 (默认是0)
table.put_cell(row, col, ctype, value, xf)
table.cell(0,0) # 文本:u'lixiaoluo'
table.cell(0,0).value # 'lixiaoluo'
  • 使用xlwt写入EXCEL
导入xlwt
 
import xlwt
 
新建一个excel文件
 
file = xlwt.Workbook() #注意这里的Workbook首字母是大写,无语吧
 
新建一个sheet
 
table = file.add_sheet('sheet name')
 
写入数据table.write(行,列,value)
 
table.write(0,0,'test')
 
如果对一个单元格重复操作,会引发
 
returns error:
# Exception: Attempt to overwrite cell:
# sheetname=u'sheet 1' rowx=0 colx=0
 
所以在打开时加cell_overwrite_ok=True解决
 
table = file.add_sheet('sheet name',cell_overwrite_ok=True)
 
保存文件
 
file.save('demo.xls')
 
另外,使用style
 
style = xlwt.XFStyle() #初始化样式
 
font = xlwt.Font() #为样式创建字体
 
font.name = 'Times New Roman'
 
font.bold = True
 
style.font = font #为样式设置字体
 
table.write(0, 0, 'some bold Times text', style) # 使用样式
  • 命令行getopt
try:
     options,args = getopt.getopt(sys.argv[1:],"hp:i:",["help","ip=","port="])
except getopt.GetoptError:
     sys.exit()
for name,value in options:
     if name in ("-h","--help"):
          usage()
     if name in ("-i","--ip"):
          print(value)
     if name in ("-p","--port"):
          print(value)
  • 简单爬虫
import requests
AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
HEADERS = {
        'User-Agent': AGENT,
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With':'XMLHttpRequest',
        'Accept':'*/*'
session = requests.session()
 
#模拟登录
postdata = {
    'defaults':'xxx',
    'fromLogin':'xxx',
    'userName':'xxx',
    'password':'xxxx'
}
url = 'xxxxxxxx'
login_info = session.post(url, headers = HEADERS, data = postdata,verify = False)
if(login_info.status_code == requests.codes.ok):
    print('login success')
    return True
else:
    print('login err')
    return False
}
 
#下载html页面
def downloadUrl(rootdir, url, orgid, page):
    html = session.get(url, headers=global_config.HEADERS, verify=False)
    if(html.text[1:7] == 'script'):
        print(html.text)
        return "err"
    if(len(html.text) < 60):
        return "err"
    sample = open(rootdir + "/" + str(orgid) + '_' + str(page) + ".html", "w", encoding='utf-8')
    sample.write(html.text)
    sample.close()
    return 'ok'
 
  • 解析JOSN文件内容
def scrapy_by_file(json_file_name):
    #读取JSON文件的内容
    text = open(json_file_name, encoding='utf-8').read()
    #特殊处理,去除从WINDOWS系统带过来的BOM特殊字符
    if text.startswith(u'\ufeff'):
        text = text.encode('utf8')[3:].decode('utf8')
    #将文本内容的JSON数据转换成自定义的JSON对象
    try:
        json_data = json.loads(text)
    except:
        print(json_file_name)
        return
    for row in json_data['rows']:
 
def scrapy_by_row(row):
    try:
        orgid = row['organization']['id']
        familyid = row['censusRegisterFamily']['id']
    except:
        print('errrr')
        return
        scrapy_by_row(row)
  • 遍历文件夹
#遍历目录(rootdir) 遍历到的每个文件都执行dirFunc
def waklThroughDir(rootdir, dirFunc):
    for parent, dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            print(filename)
            #获取后缀为txt的文件
            if(filename.split('.')[-1] == 'html'):
                dirFunc(os.path.join(parent, filename))
  • 采集温州房产网基本信息
# -*- coding: utf-8 -*-
import re
import requests
import time
 
#-----------------------------用于解析的正则表达式常量------------------------------------------------------------------
#解析页数
PAGE_NUM = '共找到 (.*?) 符合条件的记录'
#解析小区名称
NAME = 'texttext_title"><ahref(.*?)</a></div><divclass="texttext_moreinfo">'
#解析小区价格
PRICE = 'class="hot_price">(.*?)</span>'
#解析小区地址
ADDRESS = 'text_moreinfo">(.*?)</div><divclass="texttext_moreinfo"><span>'
#文件生成路径
ROOTDIR = 'F:\\test\\'
 
#-----------------------------模拟请求的头部信息,否则将被识别出是程序抓包而被拦截--------------------------------------
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'Host': 'www.0577home.net',
    'Upgrade-Insecure-Requests': '1'
}
 
#-----------------------------抓取某一页的房产信息,pageNo为页号--------------------------------------------------------
def getHouseListByPageno(pageNo):
    #建立一个连接用于后续发起请求
    session = requests.session()
    url = 'http://www.0577home.net/xiaoqu/list_0_0_0_0_0_0_0_' + str(pageNo) + '.html'
    houseList = session.get(url, headers = HEADERS, verify = False)
    #以写入模式打开文件
    fh = open(ROOTDIR + "houseList_pageNo" + str(pageNo) + ".txt",  'w' ,encoding='utf-8')
    #将movieList写入文件
    fh.write(houseList.text)
    #关闭文件
    fh.close()
 
#-------------------------------获取需要抓取的页面总数------------------------------------------------------------------
def getPageNum():
    #打开已经下载好的第一页房产内容
    f = open(ROOTDIR + 'houseList_pageNo1.txt', encoding='utf-8')
    #获取文件内容
    rawContent = f.read()
    #用正则表达式解析页面内容
    pageNum = re.findall(PAGE_NUM, rawContent)
    #返回页面号
    return int(pageNum[0]) / 20 + 1
 
def parseHouseListToFile(srcFile, dstFile):
    #打开待解析的文件
    f = open(srcFile, encoding='utf-8')
    #读取文件内容以备解析
    rawContent = f.read()
    p = re.compile('\s+')
    content = re.sub(p, '', rawContent)
    dnames = re.findall(NAME, content)
    names = []
    for dname in dnames:
        idx = dname.rfind('>')
        names.append(dname[idx + 1:])
    prices = re.findall(PRICE, content)
    daddress = re.findall(ADDRESS, content)
    address = []
    for daddr in daddress:
        id = daddr.rfind('>')
        address.append(daddr[id + 1:])
    i = 0
    for x in names:
        #写入时用'$'做分割,结尾加上回车符
        dstFile.write(names[i] + '$' + prices[i] + '$' + address[i] + '\n')
        i = i + 1
 
#-------------------------------主函数,下载并解析房产信息--------------------------------------------------------------
if __name__ == '__main__':
    #---------------------抓取页面-----------------------------
    #抓取第一页房产信息
    getHouseListByPageno(1)
    #通过第一页房产信息获取总共要抓取的页面数量
    pageNum = getPageNum()
    #抓取剩余的页面
    for i in range(2, int(pageNum) + 1):
        getHouseListByPageno(str(i))
    #---------------------解析页面-----------------------------
    #获取当前年月日
    localtime = time.strftime('%Y%m%d', time.localtime(time.time()))
    #创建一个文件,文件名前面带上年月日
    f = open(ROOTDIR + localtime + '_houseList.txt', 'a+', encoding='utf-8')
    #解析所有的页面
    #for k in range(1, int(pageNum) + 1):
    for k in range(1, 115):
        parseHouseListToFile(ROOTDIR + "houseList_pageNo" + str(k) + ".txt", f)
    #关闭文件
    f.close()
  • 采集温州房产网详细信息
# -*- coding: utf-8 -*-
import re
import requests
import time
import os
 
#-----------------------------用于解析的正则表达式常量------------------------------------------------------------------
#解析页数
PAGE_NUM = '共找到 (.*?) 符合条件的记录'
#解析小区名称
NAME = 'texttext_title"><ahref(.*?)</a></div><divclass="texttext_moreinfo">'
#解析小区价格
PRICE = 'class="hot_price">(.*?)</span>'
#解析小区地址
ADDRESS = 'text_moreinfo">(.*?)</div><divclass="texttext_moreinfo"><span>'
#解析小区编号
ID = 'class="picdiv_left"><ahref="http://www.0577home.net/xiaoqu/(.*?).html'
#解析小区所属区域
LOCATION = '<div><a>所属区域:</a><span>(.*?)</span></div>'
#解析小区占地面积
AREA = '<div><a>占地面积:</a><span>(.*?)</span></div>'
#解析小区绿化率
GREENINGRATE = '<div><a>绿化率:</a><span>(.*?)</span></div>'
#解析小区楼总数
LAYER = '<div><a>楼总数:</a><span>(.*?)</span></div>'
#解析小区物业类型
TYPE = '<div><a>物业类型:</a><span>(.*?)</span></div>'
#解析小区所属小学
PRIMARYSCHOOL = '<div><a>所属小学:</a><span>(.*?)</span></div>'
#解析小区总建筑面积
BUILDINGAREA = '<div><a>总建筑面积:</a><span>(.*?)</span></div>'
#解析小区容积率
PLOTRATIO = '<div><a>容积率:</a><span>(.*?)</span></div>'
#解析小区开发商
DEVEPLOPER = '<div><a>开发商:</a><span>(.*?)</span></div>'
#文件生成路径
ROOTDIR = 'F:\\test\\'
 
#-----------------------------模拟请求的头部信息,否则将被识别出是程序抓包而被拦截--------------------------------------
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    'Host': 'www.0577home.net',
    'Upgrade-Insecure-Requests': '1'
}
 
#-----------------------------抓取某一页的房产信息,pageNo为页号--------------------------------------------------------
def getHouseListByPageno(pageNo):
    #建立一个连接用于后续发起请求
    session = requests.session()
    url = 'http://www.0577home.net/xiaoqu/list_0_0_0_0_0_0_0_' + str(pageNo) + '.html'
    houseList = session.get(url, headers = HEADERS, verify = False)
    #以写入模式打开文件
    fh = open(ROOTDIR + "houseList_pageNo" + str(pageNo) + ".txt",  'w' ,encoding='utf-8')
    #将movieList写入文件
    fh.write(houseList.text)
    #关闭文件
    fh.close()
 
def getHouseInfoByPageno(pageNo, k):
    if(os.path.exists(ROOTDIR + "houseInfo_pageNo" + str(pageNo) + ".html")):
        return
    print('downloading !, count %s, page %s' % (str(k), str(pageNo)))
    #建立一个连接用于后续发起请求
    session = requests.session()
    url = 'http://www.0577home.net/xiaoqu/detail_' + str(pageNo) + '.html'
    houseList = session.get(url, headers = HEADERS, verify = False)
 
    #以写入模式打开文件
    fh = open(ROOTDIR + "houseInfo_pageNo" + str(pageNo) + ".html",  'w' ,encoding='utf-8')
    #将movieList写入文件
    fh.write(houseList.text)
    #关闭文件
    fh.close()
 
#-------------------------------获取需要抓取的页面总数------------------------------------------------------------------
def getPageNum():
    #打开已经下载好的第一页房产内容
    f = open(ROOTDIR + 'houseList_pageNo1.txt', encoding='utf-8')
    #获取文件内容
    rawContent = f.read()
    #用正则表达式解析页面内容
    pageNum = re.findall(PAGE_NUM, rawContent)
    #返回页面号
    return int(pageNum[0]) / 20 + 1
 
def parseHouseInfo(srcFile):
    #打开待解析的文件
    f = open(srcFile, encoding='utf-8')
    #读取文件内容以备解析
    content = f.read()
    # p = re.compile('\s+')
    # content = re.sub(p, '', rawContent)
    location = re.findall(LOCATION, content)[0]
    location = location.split(' ')
    category1 = location[0]
    category2 = location[1]
    area = re.findall(AREA, content)[0]
    greeningrate = re.findall(GREENINGRATE, content)[0]
    layer = re.findall(LAYER, content)[0]
    type = re.findall(TYPE, content)[0]
    primaryschool = re.findall(PRIMARYSCHOOL, content)[0]
    buildingarea = re.findall(BUILDINGAREA, content)[0]
    plotratio = re.findall(PLOTRATIO, content)[0]
    developer = re.findall(DEVEPLOPER, content)[0]
    f.close()
    return (category1, category2, area, greeningrate, layer, type, primaryschool, buildingarea, plotratio, developer)
 
def parseHouseListToFile(srcFile, dstFile):
    #打开待解析的文件
    f = open(srcFile, encoding='utf-8')
    #读取文件内容以备解析
    rawContent = f.read()
    p = re.compile('\s+')
    content = re.sub(p, '', rawContent)
    dnames = re.findall(NAME, content)
    names = []
    for dname in dnames:
        idx = dname.rfind('>')
        names.append(dname[idx + 1:])
    prices = re.findall(PRICE, content)
    daddress = re.findall(ADDRESS, content)
    ids = re.findall(ID, content)
    address = []
    for daddr in daddress:
        id = daddr.rfind('>')
        address.append(daddr[id + 1:])
    i = 0
    f.close()
    for x in names:
        #写入时用'$'做分割,结尾加上回车符
        dstFile.write(names[i] + '$' + prices[i] + '$' + address[i] + '$' + ids[i] + '\n')
        i = i + 1
 
#-------------------------------主函数,下载并解析房产信息--------------------------------------------------------------
if __name__ == '__main__':
    #---------------------抓取页面-----------------------------
    #抓取第一页房产信息
    # getHouseListByPageno(1)
    # #通过第一页房产信息获取总共要抓取的页面数量
    # pageNum = getPageNum()
    # #抓取剩余的页面
    # for i in range(2, int(pageNum) + 1):
    #    getHouseListByPageno(str(i))
    #---------------------解析页面-----------------------------
    #获取当前年月日
    localtime = time.strftime('%Y%m%d', time.localtime(time.time()))
    #创建一个文件,文件名前面带上年月日
    f = open(ROOTDIR + localtime + '_houseList.txt', 'a+', encoding='utf-8')
    #解析所有的页面
    #for k in range(1, int(pageNum) + 1):
    for k in range(1, 115):
        parseHouseListToFile(ROOTDIR + "houseList_pageNo" + str(k) + ".txt", f)
    #关闭文件
    f.close()
    f = open(ROOTDIR + localtime + '_houseList.txt', encoding='utf-8')
    fd = open(ROOTDIR + localtime + '_houseInfo.txt', 'w', encoding='utf-8')
    k = 0
    for line in f:
        data = line.strip('\n')
        data = data.split('$')
        idx = data[3]
        getHouseInfoByPageno(idx, k)
        houseInfo = parseHouseInfo(ROOTDIR + "houseInfo_pageNo" + str(idx) + ".html")
        print(str(k) + "$".join(data) + '$' + "$".join(houseInfo))
        fd.write("$".join(data) + '$' + "$".join(houseInfo) + '\n')
        k += 1
    f.close()
    fd.close()
  • 读取csv文件
with open('job.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
          print(row)
  • 写入csv文件
#创建CSV文件并写入第一行
def createCsv(file):
    if not os.path.exists(file):
        csvfile = open(file, 'a+', encoding='utf-8', newline='')
        writer = csv.writer(csvfile)
        writer.writerow(paramname)
    else:
        csvfile = open(file, 'a+', newline='')
        writer = csv.writer(csvfile)
    return writer
  • python调用JAVA
import sys
import jpype
 
name = sys.argv[1]
jarpath = '/home/dsadm/why/python'
jpype.startJVM(jpype.getDefaultJVMPath(), "-Djava.ext.dirs=%s" % jarpath)
DECRYPT = jpype.JClass('why.fmrt.decrypt.DECRYPT')
upperName =DECRYPT.decrypt(name)
print(upperName)
jpype.shutdownJVM()
  • 简单验证码破解
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps
 
def cleanImage(imagePath):
    image = Image.open(imagePath)
    image = image.point(lambda x: 0 if x<143 else 255)
    borderImage = ImageOps.expand(image,border=20,fill='white')
    borderImage.save(imagePath)
 
html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html, "html.parser")
#Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name":"captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name":"captcha_token"})["value"]
 
captchaUrl = "http://pythonscraping.com"+imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout=
    subprocess.PIPE,stderr=subprocess.PIPE)
p.wait()
f = open("captcha.txt", "r")
 
#Clean any whitespace characters
captchaResponse = f.read().replace(" ", "").replace("\n", "")
print("Captcha solution attempt: "+captchaResponse)
 
if len(captchaResponse) == 5:
    params = {"captcha_token":captchaToken, "captcha_sid":captchaSid,
              "form_id":"comment_node_page_form", "form_build_id": formBuildId,
                  "captcha_response":captchaResponse, "name":"Ryan Mitchell",
                  "subject": "I come to seek the Grail",
                  "comment_body[und][0][value]":
                                          "...and I am definitely not a bot"}
    r = requests.post("http://www.pythonscraping.com/comment/reply/10",
                          data=params)
    responseObj = BeautifulSoup(r.text)
    if responseObj.find("div", {"class":"messages"}) is not None:
        print(responseObj.find("div", {"class":"messages"}).get_text())
else:
    print("There was a problem reading the CAPTCHA correctly!")

  • 滑块验证码破解
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import PIL.Image as image
import time,re, random
import requests
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
 
#爬虫模拟的浏览器头部信息
agent = 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'
headers = {
        'User-Agent': agent
        }
 
# 根据位置对图片进行合并还原
# filename:图片
# location_list:图片位置
#内部两个图片处理函数的介绍
#crop函数带的参数为(起始点的横坐标,起始点的纵坐标,宽度,高度)
#paste函数的参数为(需要修改的图片,粘贴的起始点的横坐标,粘贴的起始点的纵坐标)
def get_merge_image(filename,location_list):
    #打开图片文件
    im = image.open(filename)
    #创建新的图片,大小为260*116
    new_im = image.new('RGB', (260,116))
    im_list_upper=[]
    im_list_down=[]
    # 拷贝图片
    for location in location_list:
        #上面的图片
        if location['y']==-58:
            im_list_upper.append(im.crop((abs(location['x']),58,abs(location['x'])+10,166)))
        #下面的图片
        if location['y']==0:
            im_list_down.append(im.crop((abs(location['x']),0,abs(location['x'])+10,58)))
    new_im = image.new('RGB', (260,116))
    x_offset = 0
    #黏贴图片
    for im in im_list_upper:
        new_im.paste(im, (x_offset,0))
        x_offset += im.size[0]
    x_offset = 0
    for im in im_list_down:
        new_im.paste(im, (x_offset,58))
        x_offset += im.size[0]
    return new_im
 
#下载并还原图片
# driver:webdriver
# div:图片的div
def get_image(driver,div):
    #找到图片所在的div
    background_images=driver.find_elements_by_xpath(div)
    location_list=[]
    imageurl=''
    #图片是被CSS按照位移的方式打乱的,我们需要找出这些位移,为后续还原做好准备
    for background_image in background_images:
        location={}
        #在html里面解析出小图片的url地址,还有长高的数值
        location['x']=int(re.findall("background-image: url\(\"(.*)\"\); background-position: (.*)px (.*)px;",background_image.get_attribute('style'))[0][1])
        location['y']=int(re.findall("background-image: url\(\"(.*)\"\); background-position: (.*)px (.*)px;",background_image.get_attribute('style'))[0][2])
        imageurl=re.findall("background-image: url\(\"(.*)\"\); background-position: (.*)px (.*)px;",background_image.get_attribute('style'))[0][0]
        location_list.append(location)
    #替换图片的后缀,获得图片的URL
    imageurl=imageurl.replace("webp","jpg")
    #获得图片的名字
    imageName = imageurl.split('/')[-1]
    #获得图片
    session = requests.session()
    r = session.get(imageurl, headers = headers, verify = False)
    #下载图片
    with open(imageName, 'wb') as f:
        f.write(r.content)
        f.close()
    #重新合并还原图片
    image=get_merge_image(imageName, location_list)
    return image
 
#对比RGB值
def is_similar(image1,image2,x,y):
    pass
    #获取指定位置的RGB值
    pixel1=image1.getpixel((x,y))
    pixel2=image2.getpixel((x,y))
    for i in range(0,3):
        # 如果相差超过50则就认为找到了缺口的位置
        if abs(pixel1[i]-pixel2[i])>=50:
            return False
    return True
 
#计算缺口的位置
def get_diff_location(image1,image2):
    i=0
    # 两张原始图的大小都是相同的260*116
    # 那就通过两个for循环依次对比每个像素点的RGB值
    # 如果相差超过50则就认为找到了缺口的位置
    for i in range(0,260):
        for j in range(0,116):
            if is_similar(image1,image2,i,j)==False:
                return  i
 
#根据缺口的位置模拟x轴移动的轨迹
def get_track(length):
    pass
    list=[]
    #间隔通过随机范围函数来获得,每次移动一步或者两步
    x=random.randint(1,3)
    #生成轨迹并保存到list内
    while length-x>=5:
        list.append(x)
        length=length-x
        x=random.randint(1,3)
    #最后五步都是一步步移动
    for i in range(length):
        list.append(1)
    return list
 
#滑动验证码破解程序
def main():
    #打开火狐浏览器
    driver = webdriver.Firefox()
    #用火狐浏览器打开网页
    driver.get("http://www.geetest.com/exp_embed")
    #等待页面的上元素刷新出来
    WebDriverWait(driver, 30).until(lambda the_driver: the_driver.find_element_by_xpath("//div[@class='gt_slider_knob gt_show']").is_displayed())
    WebDriverWait(driver, 30).until(lambda the_driver: the_driver.find_element_by_xpath("//div[@class='gt_cut_bg gt_show']").is_displayed())
    WebDriverWait(driver, 30).until(lambda the_driver: the_driver.find_element_by_xpath("//div[@class='gt_cut_fullbg gt_show']").is_displayed())
    #下载图片
    image1=get_image(driver, "//div[@class='gt_cut_bg gt_show']/div")
    image2=get_image(driver, "//div[@class='gt_cut_fullbg gt_show']/div")
    #计算缺口位置
    loc=get_diff_location(image1, image2)
    #生成x的移动轨迹点
    track_list=get_track(loc)
    #找到滑动的圆球
    element=driver.find_element_by_xpath("//div[@class='gt_slider_knob gt_show']")
    location=element.location
    #获得滑动圆球的高度
    y=location['y']
    #鼠标点击元素并按住不放
    print ("第一步,点击元素")
    ActionChains(driver).click_and_hold(on_element=element).perform()
    time.sleep(0.15)
    print ("第二步,拖动元素")
    track_string = ""
    for track in track_list:
        #不能移动太快,否则会被认为是程序执行
        track_string = track_string + "{%d,%d}," % (track, y - 445)
        #xoffset=track+22:这里的移动位置的值是相对于滑动圆球左上角的相对值,而轨迹变量里的是圆球的中心点,所以要加上圆球长度的一半。
        #yoffset=y-445:这里也是一样的。不过要注意的是不同的浏览器渲染出来的结果是不一样的,要保证最终的计算后的值是22,也就是圆球高度的一半
        ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=track+22, yoffset=y-445).perform()
        #间隔时间也通过随机函数来获得,间隔不能太快,否则会被认为是程序执行
        time.sleep(random.randint(10,50)/100)
    print (track_string)
    #xoffset=21,本质就是向后退一格。这里退了5格是因为圆球的位置和滑动条的左边缘有5格的距离
    ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=21, yoffset=y-445).perform()
    time.sleep(0.1)
    ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=21, yoffset=y-445).perform()
    time.sleep(0.1)
    ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=21, yoffset=y-445).perform()
    time.sleep(0.1)
    ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=21, yoffset=y-445).perform()
    time.sleep(0.1)
    ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=21, yoffset=y-445).perform()
    print ("第三步,释放鼠标")
    #释放鼠标
    ActionChains(driver).release(on_element=element).perform()
    time.sleep(3)
    #点击验证
    # submit = driver.find_element_by_xpath("//div[@class='gt_ajax_tip success']")
    # print(submit.location)
    # time.sleep(5)
    #关闭浏览器,为了演示方便,暂时注释掉.
    #driver.quit()
 
#主函数入口
if __name__ == '__main__':
    pass
    main()

  • python构建web页面
import os
 
import tornado.httpserver
import tornado.ioloop
import tornado.options
import tornado.web
 
from view import *
 
from tornado.options import define, options
define("port", default=8000, help="run on the given port", type=int)
 
class Application(tornado.web.Application):
    def __init__(self):
        handlers = [
            (r"/", Indexhandler),
        ]
        settings = dict(
            template_path=os.path.join(os.path.dirname(__file__), 'templates'),
            autoescape=None,
            debug=False,
        )
        tornado.web.Application.__init__(self, handlers, **settings)
 
if __name__ == "__main__":
    tornado.options.parse_command_line()
    http_server = tornado.httpserver.HTTPServer(Application(), xheaders=True)
    http_server.listen(options.port)
    tornado.ioloop.IOLoop.instance().start()

  • 定时任务
#! /usr/bin/env python
# coding=utf-8
import time, os, sched
 
# 第一个参数确定任务的时间,返回从某个特定的时间到现在经历的秒数
# 第二个参数以某种人为的方式衡量时间
schedule = sched.scheduler(time.time, time.sleep)
 
def perform_command(cmd, inc):
    # 安排inc秒后再次运行自己,即周期运行
    schedule.enter(inc, 0, perform_command, (cmd, inc))
    os.system(cmd)
 
def timming_exe(cmd, inc=60):
    # enter用来安排某事件的发生时间,从现在起第n秒开始启动
    schedule.enter(inc, 0, perform_command, (cmd, inc))
    # 持续运行,直到计划时间队列变成空为止
    schedule.run()
 
#每隔一天调用getMovieList.py程序
timming_exe("getMovieList.py", 60 * 60 * 24)

  • 通过百度地图API,标准化地址
from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.error import URLError
import json
 
class xBaiduMap:
    def __init__(self, key='mgf2Gxr7EgnfPVQnpClZnsug'):
        self.host = 'http://api.map.baidu.com'
        self.path = '/geocoder?'
        self.param = {'address': None, 'output': 'json', 'key': key, 'location': None, 'city': None}
 
    def getLocation(self, address, city=None):
        rlt = self.geocoding('address', address, city)
        if rlt != None:
            l = rlt['result']
            if isinstance(l, list):
                return None
            return l['location']['lat'], l['location']['lng']
 
    def getAddress(self, lat, lng):
        rlt = self.geocoding('location', "{0},{1}".format(lat, lng))
        if rlt != None:
            l = rlt['result']
            #return l['formatted_address']
            # Here you can get more details about the location with 'addressComponent' key
            ld=rlt['result']['addressComponent']
            return (ld['city']+';'+ld['district']+';'+ld['street']+";"+ld['street_number'])
 
    def geocoding(self, key, value, city=None):
        if key == 'location':
            if 'city' in self.param:
                del self.param['city']
            if 'address' in self.param:
                del self.param['address']
 
        elif key == 'address':
            if 'location' in self.param:
                del self.param['location']
            if city == None and 'city' in self.param:
                del self.param['city']
            else:
                self.param['city'] = city
        self.param[key] = value
        try:
            r = urlopen(self.host + self.path + urlencode(self.param)).read()
        except URLError:
            print ("URLError")
            return None
 
        str_response = r.decode('utf-8')
        rlt = json.loads(str_response)
        if rlt['status'] == 'OK':
            return rlt
        else:
            print ("Decoding Failed")
            return None
  • 多进程
import multiprocessing
for process_id in range(PROCESS_NUM):
    p = multiprocessing.Process(target=worker, args=(process_id,))
    jobs.append(p)
    p.start()

  • 文件切割小程序
def split_file(file_name, file_num):
    #文件已经存在
    if(os.path.exists("split_0.txt")):
        return
 
    #统计文件的总行数
    count = -1
    file = open(file_name, encoding='utf-8')
    for count, line in enumerate(file):
        pass
    count += 1
    file.close()
 
    #每个文件的行数
    count_per_file = count / file_num
 
    #创建file_num个新文件
    for i in range(file_num):
        file = open("split_" + str(i) + ".txt", 'w', encoding='utf-8')
        file.close()
 
    #分割成file_num个新文件
    file = open(file_name, encoding='utf-8')
    count = -1
    for count, line in enumerate(file):
        file_index = (int)(count /count_per_file)
        sub_file = open("split_" + str(file_index) + ".txt", "a+", encoding='utf-8')
        if(sub_file != None):
            sub_file.write(line)

  • python操作DB2
import ibm_db
con = ibm_db.connect("DATABASE=FMRT;HOSTNAME=XX.XX.XX.XX;PORT=60000;PORTOCOL=TCPIP;UID=db2inst1;PWD=db2inst1;", "", "")
sql = getSql(inputfile)
stmt = ibm_db.exec_immediate(con, sql)
result = ibm_db.fetch_both(stmt)
rowidx = 0
while (result):
    #DO SOMETHING
    result = ibm_db.fetch_both(stmt)
ibm_db.close(con)
  • jieba中文分词
import jieba
 
seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
for line in seg_list:
    print(line)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式
 
seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式
 
seg_list = jieba.cut("他来到了网易杭研大厦")  # 默认是精确模式
print(", ".join(seg_list))
 
seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # 搜索引擎模式
print(", ".join(seg_list))
  • 月末判断
import calendar
import sys
 
def isMonthEnd(datetime):
    year = int(datetime[0:4])
    month = int(datetime[4:6])
    day = int(datetime[6:8])
    wday, monthrange = calendar.monthrange(year, month)
    if(day == monthrange):
        return 1
    else:
        return 0
 
isMonthEnd(sys.argv[1])

  • 移除中文分隔符
cmd = "sed ':a;N;$ s/\\r\\n//g;ba' " + oldfile + " > " + newfile
os.system(cmd)
 
  • 多线程
# -*- coding: utf-8 -*-
"""
    thread
    ~~~~~~~~~~~~~~~~
 
  Thread framework
 
    :copyright: (c) 2016 by why.
    :license: MIT, see LICENSE for more details.
"""
 
import threading
 
class Threadconfig():
    def __init__(self, thread_size):
        self.thread_size = thread_size
 
    def topen(self):
        self.thread_tasks = []
 
    def build(self, func, **kwargs):
        self.thread_task = threading.Thread(target=func, kwargs=(kwargs))
        self.thread_tasks.append(self.thread_task)
 
    def run(self):
        for thread_task in self.thread_tasks:
            thread_task.setDaemon(True)
            thread_task.start()
        while 1:
            alive = False
            for thread_num in range(0, self.thread_size):
                alive = alive or self.thread_tasks[thread_num].isAlive()
            if not alive:
                break
 
    def __del__(self):
        self.thread_tasks = []

 
  • python 安装wheel
     pip install *.wheel

posted on 2017-05-15 10:41  Kernel_wu  阅读(478)  评论(0编辑  收藏  举报

导航