python (1)一个简单的爬虫: python 在windows下 创建文件夹并写入文件

1.一个简单的爬虫:爬取豆瓣的热门电影的信息

写在前面:如何创建本来不存在的文件夹并写入文件

import os

# The "inn" directory does not exist at first: define the path, create the
# directory if it is missing, and from then on files can be opened inside it.
t_path = "d:/py/inn"
try:
    os.makedirs(t_path)
except OSError:
    # An already-existing directory is fine; any other failure (for example
    # a permission problem) is a real error and is re-raised.
    if not os.path.isdir(t_path):
        raise

# Build the file path from t_path so the directory and the file location
# cannot drift apart. The handle is left open for the code that follows;
# call f.close() once writing is finished.
f = open(os.path.join(t_path, 'info.txt'), 'a')

 

技能:获取网页源码,正则表达式,函数调用,全局变量的定义

 1 #! /usr/bin/env python
 2 # -*- coding=utf-8 -*-
 3 import requests
 4 import json
 5 import re
 6 import sys
 7 reload(sys)
 8 sys.setdefaultencoding("utf-8")
 9 classinfo = []
10 f = open('info.txt','w')
11 
12 num = 0
# Compiled once at import time instead of on every call / every movie.
# Each movie's attributes sit between "data-tit" and "data-enough".
_ITEM_RE = re.compile(r'data-tit(.*?)data-enough', re.S)
# (dict key, pattern capturing that attribute's value inside one movie chunk)
_FIELD_RES = (
    ('title', re.compile(r'le="(.*?)"', re.S)),
    ('year', re.compile(r'data-release="(.*?)" data', re.S)),
    ('Rating', re.compile(r'data-rate="(.*?)" data-star', re.S)),
    ('time', re.compile(r'data-duration="(.*?)" data-re', re.S)),
    ('reg', re.compile(r'data-region="(.*?)" data-dir', re.S)),
    ('act', re.compile(r'data-actors="(.*?)" data-in', re.S)),
)


def _parse_movie(chunk):
    """Return the attribute values found in one movie's markup chunk.

    Every key of ``_FIELD_RES`` is present in the result. An attribute that
    is absent from ``chunk`` yields an empty string, so one incomplete entry
    does not abort the whole run.
    """
    info = {}
    for key, pattern in _FIELD_RES:
        found = pattern.search(chunk)
        info[key] = found.group(1) if found else u''
    return info


def write(htm):
    """Extract every movie from ``htm.text`` and append it to the output file.

    Args:
        htm: object whose ``text`` attribute holds the page source
            (``getremen`` passes a ``requests`` response).

    Side effects:
        Writes one numbered record per movie to the module-level file ``f``
        and increments the module-level counter ``num`` for each record.
    """
    global num  # the counter lives at module level
    for chunk in _ITEM_RE.findall(htm.text):
        info = _parse_movie(chunk)
        num += 1
        # One write per record; f.writelines() on a bare string would feed
        # the file one character at a time.
        f.write(u''.join([
            u'%d\n' % num,
            u'电影名:' + info['title'] + u'\n',
            u'主演:' + info['act'] + u'\n',
            u'电影地区:' + info['reg'] + u'\n',
            u'上映年份:' + info['year'] + u'\n',
            u'电影时长:' + info['time'] + u'\n',
            u'评分:' + info['Rating'] + u'\n\n',
        ]))
def getremen():
    """Download the Douban movie home page and pass the response to ``write``.

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or the server answers with a 4xx/5xx status.
    """
    url = 'http://movie.douban.com/'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    html = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently producing an empty output file.
    html.raise_for_status()
    # Decode the body as UTF-8 regardless of what the response headers say.
    html.encoding = 'utf-8'
    write(html)
if __name__ == "__main__":
    try:
        getremen()
    finally:
        # The output file is opened at module level; close it explicitly so
        # buffered records are flushed even if the scrape fails part-way.
        f.close()

 

posted on 2016-01-18 19:18  细雨微光  阅读(2688)  评论(0)  编辑  收藏  举报