python src批量爬取

import requests, time
from lxml import etree

def src_tiqu(yeshu):
    """Scrape link texts from the src.sjtu.edu.cn list pages into ``src.txt``.

    Pages ``1`` through ``int(yeshu)`` are fetched one at a time. For each
    page, the text of every ``<a>`` directly inside a ``<td class="">`` is
    collected, split on whitespace, printed, and appended to ``src.txt``
    (UTF-8, one token per line).

    A failure on one page is reported and followed by a short pause; the
    remaining pages are still processed.

    Args:
        yeshu: Number of pages to fetch; any value ``int()`` accepts.

    Raises:
        ValueError: If ``yeshu`` is a string that is not a valid integer.
    """
    for i in range(1, int(yeshu) + 1):
        try:
            url = 'https://src.sjtu.edu.cn/list/?i=' + str(i)
            print('提取->', str(i) + '页数')
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, timeout=10)
            # Treat HTTP error statuses as failures instead of parsing the
            # error page as if it were a listing.
            response.raise_for_status()
            soup = etree.HTML(response.content.decode('utf-8'))
            result = soup.xpath('//td[@class=""]/a/text()')

            # Splitting on whitespace drops blank entries and padding; note
            # that a text with inner whitespace becomes several tokens.
            resultss = '\n'.join(result).split()
            print(resultss)
            if resultss:
                # Open the output once per page rather than once per token;
                # the with-block closes the file on exit.
                with open(r'src.txt', 'a', encoding='utf-8') as f:
                    f.writelines(edu + '\n' for edu in resultss)
        except Exception as e:
            # Best-effort scraping: report the failure rather than hiding
            # it, back off briefly, then move on to the next page.
            print('提取失败->', str(i) + '页数', e)
            time.sleep(0.5)


if __name__ == '__main__':
    # Prompt for the number of list pages and hand the raw answer to the
    # scraper, which performs the integer conversion itself.
    src_tiqu(input("提取多少页:"))

 

posted @ 2021-08-13 12:44  bingtanghulu  阅读(294)  评论(0)  编辑  收藏  举报