1 import requests
2 from bs4 import BeautifulSoup
3 import bs4
4 import re
5 import csv
6 #import pandas as pd
7 # r = requests.get("https://www.futurelearn.com/courses/climate-change-the-solutions,timeout = 30")
8 # r.encoding = r.apparent_encoding
9 # print(r.text)
def getHTMLText(url, code='utf-8'):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the body when no encoding can be
            detected from the content itself. Defaults to ``'utf-8'``.

    Returns:
        The decoded page text, or ``None`` when the request fails
        (connection error, timeout, invalid URL or an HTTP error status).
        A failure is also reported on stdout.
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into exceptions so that an error page is
        # never handed to the parser as if it were real content.
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "fetch failed";
        # KeyboardInterrupt and programming errors still propagate.
        print("获取失败")
        return None

    # Prefer the encoding guessed from the body over the one declared in the
    # headers; fall back to ``code`` when the guess yields nothing.
    r.encoding = r.apparent_encoding or code
    return r.text
# Scrape the basic list of course links
def getCourseList(lst, html):
    """Append the absolute URL of every course linked from a listing page.

    The page is searched for ``div`` elements whose class attribute is
    ``m-grid-of-cards m-grid-of-cards--compact``; every ``href`` found inside
    them that contains the text ``courses`` is resolved against
    ``https://www.futurelearn.com`` and appended to ``lst``.

    Args:
        lst: List that receives the course URLs. It is modified in place;
            URLs already present in it are not added a second time.
        html: Markup of the listing page. ``None`` or an empty string
            (e.g. after a failed download) leaves ``lst`` untouched.

    Returns:
        None. Results are delivered through ``lst``.
    """
    from urllib.parse import urljoin  # local import keeps the block self-contained

    if not html:
        # getHTMLText yields None when the download fails; there is nothing
        # to parse, and BeautifulSoup would raise on None.
        return

    base_url = "https://www.futurelearn.com"
    grid_class = 'm-grid-of-cards m-grid-of-cards--compact'

    soup = BeautifulSoup(html, 'html.parser')
    seen = set(lst)
    for grid in soup.find_all('div', class_=grid_class):
        # Read the href attribute from the parsed tag instead of running a
        # regex over the serialized HTML: the value is then never polluted
        # by neighbouring attributes such as role="button".
        for tag in grid.find_all(href=True):
            href = tag['href']
            if 'courses' not in href:
                continue
            # urljoin copes with both site-relative and absolute hrefs.
            course_url = urljoin(base_url, href)
            if course_url in seen:
                # The same course can be linked more than once per card.
                continue
            seen.add(course_url)
            lst.append(course_url)
def CourseList(lst):
    """Download every course page in ``lst`` and collect its text fields.

    For each page the following texts are gathered, in this order, into one
    row:

    1. every ``h1`` with class ``m-dual-billboard__heading``
       (presumably the course title - confirm against the live markup),
    2. every ``span`` with class ``m-key-info__content``,
    3. every ``p`` inside the first ``div`` with class
       ``a-content a-content--tight``,
    4. every ``p`` inside the first ``section`` with class
       ``a-section a-section--alt-adjacent``.

    Each finished row is printed. Pages that cannot be downloaded are
    skipped, and a container that is missing from a page simply contributes
    no paragraphs.

    Args:
        lst: Iterable of course page URLs.

    Returns:
        A list with one row (list of strings) per successfully downloaded
        course page, in input order.
    """
    rows = []
    # Containers whose paragraphs are appended after the heading/key-info.
    paragraph_containers = (
        ('div', "a-content a-content--tight"),
        ('section', 'a-section a-section--alt-adjacent'),
    )

    for url in lst:
        html = getHTMLText(url)
        if not html:
            # The failure has already been reported by getHTMLText; one
            # broken page must not abort the whole crawl.
            continue
        page = BeautifulSoup(html, 'html.parser')

        row = [h.text for h in page.find_all('h1', class_='m-dual-billboard__heading')]
        row.extend(s.text for s in page.find_all('span', class_='m-key-info__content'))

        for tag_name, css_class in paragraph_containers:
            container = page.find(tag_name, class_=css_class)
            if container is None:
                # find() returns None when the page has no such element.
                continue
            row.extend(p.text for p in container.find_all('p'))

        print(row)
        rows.append(row)

    return rows
72
73
74 # def write_dictionary_to_csv(list_1,filename):
75 # file_name='{}.csv'.format(filename)
76 # name = ['课程名','课时','学习任务','课程性质','额外费用','介绍','话题','开始时间','服务对象','']
77 # test = pd.DataFrame(columns = name,data = list_1)
78 # test.to_csv(file_name)
79
def main():
    """Crawl the science/engineering/maths category and print each course.

    Downloads the category listing, extracts the course page URLs from it
    and then processes every course page with ``CourseList``. If the listing
    itself cannot be downloaded the function returns without crawling.
    """
    start_url = "https://www.futurelearn.com/courses/categories/science-engineering-and-maths-courses?all_courses=1"

    html = getHTMLText(start_url)
    if not html:
        # The download failed (already reported); without the listing there
        # are no course links to follow.
        return

    infoList = []
    getCourseList(infoList, html)
    CourseList(infoList)
    # TODO: write the scraped data to CSV once write_dictionary_to_csv
    # (currently commented out in this file) is restored.
90
# Run the crawl only when this file is executed as a script, so that merely
# importing the module does not start any network requests.
if __name__ == "__main__":
    main()