新闻类爬虫库:Newspaper
# Build a news "source" object from the Wired homepage.
# memoize_articles=False forces a fresh crawl instead of reusing cached article URLs.
import newspaper
from newspaper import Article
from newspaper import fulltext

source_url = 'https://www.wired.com/'
paper = newspaper.build(source_url, language="en", memoize_articles=False)
<newspaper.source.Source object at 0x7fe82c98c1d0>
# Rebuild the source, then print the URL of every article discovered on it.
import newspaper
from newspaper import Article
from newspaper import fulltext

url = 'https://www.wired.com/'
paper = newspaper.build(url, language="en", memoize_articles=False)

for art in paper.articles:
    print(art.url)
# List the category landing pages (e.g. /science, /culture) found on the source.
for cat_url in paper.category_urls():
    print(cat_url)
# Download and parse a single article by URL, then print its extracted metadata.
article = Article('https://www.wired.com/story/preterm-babies-lonely-terror-of-a-pandemic-nicu/')
article.download()  # fetch the raw HTML
article.parse()     # extract title/authors/date/images/text
article.nlp()       # fix: .summary is an empty string unless nlp() has run
print("title=", article.title)
print("author=", article.authors)
print("publish_date=", article.publish_date)
print("top_image=", article.top_image)  # fix: label typo, was "top_iamge="
print("movies=", article.movies)
print("text=", article.text)
print("summary=", article.summary)
# Take the first discovered article from the built source and print its fields.
# Renamed local from `first_url`: paper.articles holds Article objects, not URL strings.
first_article = paper.articles[0]
first_article.download()
first_article.parse()
first_article.nlp()  # fix: .summary is an empty string unless nlp() has run
print(first_article.title)
print(first_article.publish_date)
print(first_article.authors)
print(first_article.top_image)
print(first_article.summary)
print(first_article.movies)
print(first_article.text)
# Fetch raw HTML ourselves and let newspaper's fulltext() extract the body text.
import requests  # fix: snippet referenced `requests` without importing it (NameError)

html = requests.get('https://www.wired.com/story/preterm-babies-lonely-terror-of-a-pandemic-nicu/').text
print('获取的原信息-->', html)
text = fulltext(html, language='en')
print('解析后的信息', text)
# Run newspaper's NLP pass on the second discovered article
# to obtain a generated summary and keyword list.
second = paper.articles[1]
second.download()
second.parse()
second.nlp()  # must run after parse(); fills .summary and .keywords
print(second.summary)
print(second.keywords)
# Crawl several news sources concurrently using newspaper's thread pool.
import newspaper
from newspaper import news_pool

lr_paper = newspaper.build('https://lifehacker.com/', language="en")
wd_paper = newspaper.build('https://www.wired.com/', language="en")
ct_paper = newspaper.build('https://www.cnet.com/news/', language="en")

sources = [lr_paper, wd_paper, ct_paper]
news_pool.set(sources, threads_per_source=2)  # 3 sources * 2 threads = 6 workers
news_pool.join()                              # block until every download finishes
print(lr_paper.articles[0].html)
# Convenience lookups; both return values are discarded here.
newspaper.hot()           # presumably currently-trending search terms — confirm against newspaper docs
newspaper.popular_urls()  # presumably a list of well-known news site URLs — confirm against newspaper docs
本文版权归作者所有,转载请注明出处:http://www.cnblogs.com/iloverain/。未经作者同意必须保留此段声明,否则保留追究法律责任的权利。