如何使用多线程?

需求:
https://intrinio.com/tutorial/web_api
我们通过上述网站提供的API获取了某只股票的csv数据文件。现在要下载多只股票的csv数据,并将其转换为xml文件,如何使用多线程来提高下载和处理的效率?
思路:使用python标准库threading中的Thread,为每只股票创建一个线程并发地下载和转换
代码:

import csv
from StringIO import StringIO
from xml.etree.ElementTree import Element, ElementTree

import requests

from xml_pretty import pretty

def download(url):
    """Fetch *url* and return its body as an in-memory file object.

    The request is made with a timeout of 3. ``None`` is returned when the
    response status is not OK.
    """
    reply = requests.get(url, timeout=3)
    if not reply.ok:
        return None
    # Keep the payload in memory rather than writing it to disk.
    return StringIO(reply.content)

def csvToxml(scsv,fxml):
    """Convert CSV data read from *scsv* into an XML document written to *fxml*.

    The first CSV line supplies the tag names. Every following line becomes a
    ``Row`` element under the ``Data`` root, with one child element per
    column.

    Args:
        scsv: Iterable of CSV lines, e.g. the object returned by ``download``.
        fxml: File name or writable file object passed to
            ``ElementTree.write``.
    """
    reader = csv.reader(scsv)
    # The builtin next() works on Python 2.6+ and Python 3; reader.next()
    # exists on Python 2 only.
    headers = next(reader)
    # The header names are used as XML tag names, which cannot contain spaces.
    # NOTE(review): the replace() arguments were lost in the original text;
    # stripping spaces is the reconstructed intent - confirm.
    headers = [h.replace(' ', '') for h in headers]

    root = Element("Data")
    for row in reader:
        eRow = Element("Row")
        root.append(eRow)
        for tag, text in zip(headers, row):
            e = Element(tag)
            e.text = text
            eRow.append(e)

    # The return value is ignored, so pretty() presumably formats the tree
    # in place - confirm against xml_pretty.
    pretty(root)
    et = ElementTree(root)
    et.write(fxml)

def handle(sid):
    """Download the CSV for stock number *sid* and save it as an XML file.

    The stock number is left-padded with zeros to six characters; the result
    is written to ``<padded number>.xml``. Nothing is written when the
    download yields ``None``.
    """
    print('Download...(%d)' % sid)
    code = str(sid).rjust(6, '0')
    rf = download('http://table.finance.yahoo.com/table.csv?s=%s.sz' % code)
    if rf is None:
        return

    print('convert to xml...(%d)' % sid)
    with open(code + '.xml', 'wb') as out_file:
        csvToxml(rf, out_file)

# Approach 1: pass the worker function to a plain Thread object.
from threading import Thread
t = Thread(target=handle,args=(1,))  # thread object that processes stock number 1
# start() has to be *called*; the bare attribute access `t.start` did nothing,
# so the thread never ran.
t.start()

# Approach 2: subclass Thread and override run().
class MyThread(Thread):
    """Thread that processes the single stock identified by ``sid``."""

    def __init__(self, sid):
        Thread.__init__(self)  # initialise the Thread base class first
        self.sid = sid  # stock number handed to handle() in run()

    def run(self):
        """Thread body executed after ``start()``: process this stock."""
        handle(self.sid)

# One worker thread per stock number 1..10.
threads = [MyThread(i) for i in xrange(1, 11)]
for t in threads:
    t.start()

# join() blocks until that thread's run() has returned, so the message below
# is printed only after every worker has finished.
for t in threads:
    t.join()
print('main thread')

# io密集型操作,相当于超市订货(大部分时间在等待),例如上面的download操作
# cpu密集型操作,相当于超市货物搬运(需要持续占用cpu),例如csv转换xml文件
# python中的多线程不适合cpu密集型操作,原因是全局解释器锁(global interpreter lock,GIL):同一时刻只有一个线程能执行python字节码,所以python中的线程只适合处理io密集型的操作

if __name__ == '__main__':
    # Single-stock run: fetch the CSV for 000001 and convert it to XML.
    url = 'http://table.finance.yahoo.com/table.csv?s=000001.sz'
    rf = download(url)
    if rf:
        # The mode must be the string 'wb'; the bare name `wb` raised
        # NameError.
        with open('000001.xml','wb') as wf:
            csvToxml(rf,wf)

=================================================================

import requests
import base64
from io import StringIO
import csv
from xml.etree.ElementTree import ElementTree, Element, SubElement

# API key for the Intrinio price endpoint requested by download_csv.
# NOTE(review): the credential is hard-coded in source - consider loading it
# from the environment instead.
apikey = 'OjZlY2MzYTQwNGVlMTI3Y2VkYjMyYTZiNzJiYzdlOTFk'

def download_csv(page_number, timeout=10):
    """Download one page of AAPL price data as CSV.

    Downloading is I/O-bound, so running several calls in separate threads
    gives a clear speed-up.

    Args:
        page_number: Page of the result set to request.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the calling thread indefinitely.

    Returns:
        A ``StringIO`` holding the CSV text (kept in memory, not written to
        disk), or ``None`` when the response status is not OK.

    Raises:
        requests.RequestException: On connection errors or when *timeout*
            expires.
    """
    print('download csv data [page=%s]' % page_number)
    url = 'https://api.intrinio.com/prices.csv'
    # Let requests build the query string; the key comes from the module-level
    # constant instead of being repeated inside the URL literal.
    params = {
        'api_key': apikey,
        'identifier': 'AAPL',
        'page_size': 20,
        'page_number': page_number,
        'start_date': '2017-09-28',
        'end_date': '2020-09-28',
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.ok:
        return StringIO(response.text)
    return None

def csv_to_xml(csv_file, xml_path):
    """Convert CSV data into a tab-indented XML document.

    The first CSV line supplies the tag names. Every following non-blank line
    becomes a ``Row`` element under the ``Data`` root, with one child element
    per column. This step is CPU-bound, so running it in several Python
    threads brings little speed-up.

    Args:
        csv_file: Iterable of CSV lines, e.g. the ``StringIO`` returned by
            ``download_csv``.
        xml_path: File name or binary file object passed to
            ``ElementTree.write``.
    """
    print('Convert csv data to %s' % xml_path)
    reader = csv.reader(csv_file)
    # Empty input yields an empty header instead of raising StopIteration.
    headers = next(reader, [])

    root = Element('Data')
    root.text = '\n'
    root.tail = '\n'

    last_row = None
    for row in reader:
        cells = list(zip(headers, row))
        if not cells:
            # Blank CSV lines come back as []; skip them rather than emit an
            # empty Row (the original hit an unbound `e` here).
            continue

        row_elem = SubElement(root, 'Row')
        row_elem.text = '\n\t\t'
        row_elem.tail = '\n\t'

        cell = None
        for tag, text in cells:
            cell = SubElement(row_elem, tag)
            cell.text = text
            cell.tail = '\n\t\t'
        # The last cell is followed by the closing </Row>, one level up.
        cell.tail = '\n\t'
        last_row = row_elem

    if last_row is not None:
        root.text = '\n\t'
        # The last row is followed by </Data>, which sits at column 0.
        last_row.tail = '\n'

    ElementTree(root).write(xml_path, encoding='utf8')

def download_and_save(page_number, xml_path, max_attempts=5):
    """Download one page of CSV data and write it to *xml_path* as XML.

    Args:
        page_number: Page passed to ``download_csv``.
        xml_path: Destination passed to ``csv_to_xml``. The original ignored
            this argument and rebuilt the file name from *page_number*.
        max_attempts: How many times the download is tried before giving up.
            The original retried forever, which never ends if the server keeps
            rejecting the request.

    Raises:
        RuntimeError: If no attempt produced any data.
    """
    # I/O-bound part: retry a bounded number of times.
    csv_file = None
    for _ in range(max_attempts):
        csv_file = download_csv(page_number)
        if csv_file:
            break
    else:
        raise RuntimeError(
            'download failed for page %s after %s attempts'
            % (page_number, max_attempts)
        )
    # CPU-bound part: convert the downloaded data.
    csv_to_xml(csv_file, xml_path)

from threading import Thread
class MyThread(Thread):
    """Worker thread that downloads one page and stores it as XML."""

    def __init__(self, page_number, xml_path):
        Thread.__init__(self)
        self.xml_path = xml_path  # destination handed to download_and_save
        self.page_number = page_number  # page handed to download_and_save

    def run(self):
        """Thread body executed after ``start()``: process this page."""
        page, target = self.page_number, self.xml_path
        download_and_save(page, target)

if __name__ == '__main__':
    import time

    started_at = time.time()
    # Keep a reference to every worker so all of them can be joined below.
    thread_list = [MyThread(i, 'data%s.xml' % i) for i in range(1, 6)]
    for worker in thread_list:
        worker.start()

    # join() blocks the main thread until that worker has exited, so the main
    # thread finishes only after all workers are done.
    for worker in thread_list:
        worker.join()

    # Sequential baseline for timing comparison:
    #   for i in range(1, 6):
    #       download_and_save(i, 'data%s.xml' % i)
    print(time.time() - started_at)
    print('main thread end.')

posted @ 2020-10-27 14:05  Richardo-M-Lu  阅读(149)  评论(0)  编辑  收藏  举报