python Scrapy -6 实战东莞民生网站
import scrapy


class DongguanItem(scrapy.Item):
    """Container for one scraped post.

    Declares four fields: ``title``, ``content``, ``url`` and ``number``.
    Each is a plain ``scrapy.Field()`` with no extra metadata.
    """

    # Headline text of the post.
    title = scrapy.Field()
    # Body text of the post.
    content = scrapy.Field()
    # Address of the page the post was scraped from.
    url = scrapy.Field()
    # Reference number of the post.
    number = scrapy.Field()
sun.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from dongguan.items import DongguanItem


class SunSpider(CrawlSpider):
    """Crawl the type-4 question listing on wz.sun0769.com and scrape posts.

    Starting from the first listing page, the spider follows pagination
    links and hands every question detail page to ``parse_item``.
    """

    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = (
        # Pagination links of the listing. No callback is given, so these
        # pages are only used to discover further links.
        Rule(LinkExtractor(allow=r'type=4&page=\d+')),
        # Question detail pages. The dot before "shtml" is escaped so that it
        # matches a literal "." instead of any character.
        Rule(LinkExtractor(allow=r'/html/question/\d+/\d+\.shtml'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build a ``DongguanItem`` from one question detail page.

        Args:
            response: The downloaded detail page.

        Yields:
            A ``DongguanItem`` with ``title``, ``number``, ``content`` and
            ``url`` filled in. Nothing is yielded when the page has no title
            node, because the number is derived from the title.
        """
        title = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()'
        ).extract_first()
        if title is None:
            # Indexing an empty result list used to raise IndexError here;
            # skip the page explicitly and leave a trace in the log instead.
            self.logger.warning("No title found on %s; skipping page", response.url)
            return

        item = DongguanItem()
        item['title'] = title
        # Number: the text after the last ":" in the last space-separated
        # token of the title.
        item['number'] = title.split(' ')[-1].split(":")[-1]
        # Content: first text node of the content div; empty string when the
        # div is absent so the rest of the item is still emitted.
        item['content'] = response.xpath(
            '//div[@class="c1 text14_2"]/text()'
        ).extract_first(default='')
        # Link: the URL of the page that was scraped.
        item['url'] = response.url

        yield item
pipelines.py
import json


class DongguanPipeline(object):
    """Item pipeline that appends every item to ``dongguan.json``.

    Each item is written as one JSON object followed by ``",\\n"``. Non-ASCII
    characters are written as-is (not ``\\uXXXX``-escaped), encoded as UTF-8.
    """

    def __init__(self):
        # NOTE: despite its name, ``self.filename`` holds the open file
        # object; the attribute name is kept for compatibility.
        # The file is opened in binary mode because ``process_item`` writes
        # UTF-8 *bytes*; a text-mode handle rejects bytes on Python 3.
        self.filename = open("dongguan.json", "wb")

    def process_item(self, item, spider):
        """Serialize *item* to JSON, append it to the output file, return it.

        Args:
            item: The scraped item (anything ``dict()`` accepts).
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, unchanged.
        """
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.filename.close()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 别再用vector<bool>了!Google高级工程师:这可能是STL最大的设计失误
· 单元测试从入门到精通
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)