Python Scrapy - 6: The Dongguan Public Service Website in Practice, Part 2
dongdong.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from newdongguan.items import NewdongguanItem


class DongdongSpider(CrawlSpider):
    name = 'dongdong'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Rule for matching each list page
    pagelink = LinkExtractor(allow=("type=4"))
    # Rule for matching each post linked from a list page
    contentlink = LinkExtractor(allow=(r"/html/question/\d+/\d+.shtml"))

    rules = (
        # The URLs on this site are mangled by the web server, so
        # process_links is used to repair the extracted URLs.
        Rule(pagelink, process_links="deal_links"),
        Rule(contentlink, callback="parse_item")
    )

    # links is the list of links extracted from the current response
    def deal_links(self, links):
        for each in links:
            each.url = each.url.replace("?", "&").replace("Type&", "Type?")
        return links

    def parse_item(self, response):
        item = NewdongguanItem()
        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content: first try the rule for posts that contain images;
        # if it matches, it returns the list of all matching text nodes.
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If that returns an empty list, fall back to the rule for posts without images.
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()
        # Link
        item['url'] = response.url

        yield item
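The web server mangles the pagination links, so deal_links repairs each extracted URL before it is scheduled. Judging from the replace chain, the extracted links arrive with the "?" and "&" swapped; the rewritten form shown below is an assumption used only for illustration:

# A minimal sketch of the URL repair performed by deal_links.
# The sample URL (including the page number) is illustrative; the assumption
# is that the server swaps "?" and "&" in the pagination links it emits.
url = "http://wz.sun0769.com/index.php/question/questionType&type=4?page=30"
fixed = url.replace("?", "&").replace("Type&", "Type?")
print(fixed)  # http://wz.sun0769.com/index.php/question/questionType?type=4&page=30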
items.py
# -*- coding: utf-8 -*-
import scrapy


class NewdongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # Title
    title = scrapy.Field()
    # Number
    number = scrapy.Field()
    # Content
    content = scrapy.Field()
    # Link
    url = scrapy.Field()
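items.py only declares the fields; the item itself behaves like a dict, which is what the pipeline relies on when it calls dict(item). A quick illustration with made-up values (the title, number and URL below are hypothetical):

from newdongguan.items import NewdongguanItem

# All values here are invented for illustration only.
item = NewdongguanItem()
item['title'] = u'some title 编号:201707123'
item['number'] = u'201707123'
item['content'] = u'post body text'
item['url'] = 'http://wz.sun0769.com/html/question/201707/123456.shtml'
print(dict(item))  # a plain dict, ready for json.dumps in the pipeline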
pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json


class NewdongguanPipeline(object):

    def __init__(self):
        # Open the output file once when the pipeline is created
        self.filename = codecs.open("dongguan.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # By default json.dumps escapes Chinese text as ASCII \uXXXX sequences;
        # ensure_ascii=False keeps it as readable Unicode text.
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        self.filename.close()
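The ensure_ascii=False flag is what keeps the Chinese text readable in the output file. A minimal demonstration, independent of the project code:

import json

data = {u'title': u'东莞'}
print(json.dumps(data))                      # Chinese escaped as \uXXXX sequences
print(json.dumps(data, ensure_ascii=False))  # {"title": "东莞"}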
settings.py
ITEM_PIPELINES = {
    'newdongguan.pipelines.NewdongguanPipeline': 1,
}

BOT_NAME = 'newdongguan'

SPIDER_MODULES = ['newdongguan.spiders']
NEWSPIDER_MODULE = 'newdongguan.spiders'

USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
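With these four files in place, the crawl is started from the project root with scrapy crawl dongdong. If you prefer launching it from a script (a convenience, not part of the original project), scrapy's cmdline module runs the same command:

# run.py - a hypothetical helper placed in the project root next to scrapy.cfg.
# Equivalent to running `scrapy crawl dongdong` in a shell.
from scrapy import cmdline

cmdline.execute("scrapy crawl dongdong".split())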