xpath的常见操作

1. 获取某一个节点下所有的文本数据:

# Select the <div> element whose id attribute is "zoomcon".
data = response.xpath('//div[@id="zoomcon"]')
# string(.) evaluates to the string-value of each selected node (the text of
# the node and all of its descendants); the extracted pieces are joined into
# a single string.
content = ''.join(data.xpath('string(.)').extract())

这段代码将获取 id 为某个特定值的 div 节点下的所有文本数据:

image

 

http://www.nhfpc.gov.cn/fzs/s3576/200804/cdbda975a377456a82337dfe1cf176a1.shtml 

image

 

2. 获取 HTML 节点属性的值

>>> response.xpath("//div[@id='zoomtime']").extract()      
[u'<div class="content_subtitle" id="zoomtime" title="\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26"><span>\u4e2d\u534e\u4eba\u6c11\u5171\u548c\u56fd\u56fd\u5bb6\u536b\u751f\u548c\u8ba1\u5212\u751f\u80b2\u59d4\u5458\u4f1a</span><span class="wzurl_tt" style="margin-left:10px;"></span><span style="margin-left:10px;">2010-10-26</span>\r\n                <span style="margin-left:30px;"></span> </div>']
>>> response.xpath("//div[@id='zoomtime']/@title").extract()
[u'\u53d1\u5e03\u65e5\u671f\uff1a2010-10-26']

这里需要获取的是某个特定 id 的节点上 title 属性的值,使用 @title 就可以获取到:

image

 

scrapy的项目结构:

image

 

 

nhfpc.py

# -*- coding: utf-8 -*-
import scrapy
import sys
import hashlib
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from datetime import *
from common_lib import *

# Python 2-only workaround: sys.setdefaultencoding is removed from the sys
# module at interpreter start-up, so reload(sys) is needed to get it back.
# Setting the default to UTF-8 makes implicit str <-> unicode conversions use
# UTF-8 instead of ASCII. The two statements must stay in this order.
reload(sys)
sys.setdefaultencoding('utf-8')

class NhfpcItem(scrapy.Item):
    """Scrapy item declaring five fields for a crawled page.

    NOTE(review): NhfpcSpider.parse_item in this file does not build or
    yield this item; it sends a plain list to insertDB instead. Confirm
    whether the item is still needed.
    """
    url = scrapy.Field()  # presumably the page URL -- verify against users
    name = scrapy.Field()
    description = scrapy.Field()
    size = scrapy.Field()
    dateTime = scrapy.Field()
    


class NhfpcSpider(scrapy.contrib.spiders.CrawlSpider):
    """Crawl policy pages on nhfpc.gov.cn and store each one via insertDB.

    The crawl starts from the seven ``/fzs/pzcfg/`` list pages. Links whose
    URL contains six consecutive digits followed by ``/`` are passed to
    ``parse_item``; links whose URL contains ``201307`` are followed so that
    more links can be extracted from them.
    """

    name = "nhfpc"
    allowed_domains = ["nhfpc.gov.cn"]
    start_urls = (
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_2.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_3.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_4.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_5.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_6.shtml',
        'http://www.nhfpc.gov.cn/fzs/pzcfg/list_7.shtml',
    )

    rules = (
        Rule(
            # Raw string so that \d reaches the regex engine unchanged.
            LinkExtractor(allow=r'.*\d{6}/.*'),
            callback='parse_item'
        ),
        Rule(
            LinkExtractor(allow='.*201307.*'),
            follow=True,
        ),
    )

    # XPath queries tried in order to find the page title.
    # NOTE(review): 'zoomtitl' is kept exactly as in the original code; it may
    # be a typo or a real alternative id used by some pages -- confirm.
    _TITLE_XPATHS = (
        "//div[@id='zoomtitle']/*/text()",
        "//div[@id='zoomtitl']/*/text()",
    )

    # The title attribute of div#zoomtime looks like "<label><U+FF1A><date>",
    # i.e. a label, a fullwidth colon, then the date (e.g. 2010-10-26).
    _DATE_LABEL_SEPARATOR = u"\uff1a"

    # Date used when a page carries no publication date at all.
    _DEFAULT_PUB_DATE = "1970-01-01"

    def parse_item(self, response):
        """Extract title, body text and publication date, then store them.

        The values handed to ``insertDB`` are, in order: title, MD5 hex
        digest of the URL, publication time, insertion time, web site,
        content and URL.

        Args:
            response: The Scrapy response of a policy detail page.
        """
        title = self._extract_title(response)

        # Some pages use id="contentzoom" instead of id="zoomcon" for the body.
        body = response.xpath('//div[@id="zoomcon"]')
        if len(body) == 0:
            body = response.xpath('//div[@id="contentzoom"]')
        content = ''.join(body.xpath('string(.)').extract())

        # Only a date is available on the page; midnight is used as the time.
        pubTime = self._extract_pub_date(response) + " 00:00:00"
        insertTime = datetime.now()
        md5Url = hashlib.md5(response.url.encode('utf-8')).hexdigest()

        values = [
            title,
            md5Url,
            pubTime,
            insertTime,
            "nhfpc.gov.cn",
            content,
            response.url,
        ]
        insertDB(values)

    def _extract_title(self, response):
        """Return the stripped first title text, or '' when none is found."""
        for query in self._TITLE_XPATHS:
            texts = response.xpath(query).extract()
            if texts:
                return texts[0].strip()
        return ""

    def _extract_pub_date(self, response):
        """Return the publication date string, or the default when missing."""
        labelled = ''.join(
            response.xpath("//div[@id='zoomtime']/@title").extract())
        if labelled:
            # Keep only what follows the fullwidth colon; if the label is
            # absent, use the attribute value as it is.
            _, sep, tail = labelled.partition(self._DATE_LABEL_SEPARATOR)
            pub_date = tail if sep else labelled
        else:
            pub_date = ''.join(
                response.xpath("//ucmspubtime/text()").extract())
        return pub_date.strip() or self._DEFAULT_PUB_DATE

 

common_lib.py

#!/usr/bin/python
#-*-coding:utf-8-*-

'''
This file includes all the common routines that are needed in
the crawler project.
Author: Justnzhang @(uestczhangchao@qq.com)
Time: 2014-07-28 15:03:44
'''
import os
import sys
import MySQLdb
from urllib import quote, unquote
import uuid

# Python 2-only workaround: sys.setdefaultencoding is removed from the sys
# module at interpreter start-up, so reload(sys) is needed to get it back.
# Setting the default to UTF-8 makes implicit str <-> unicode conversions use
# UTF-8 instead of ASCII. The two statements must stay in this order.
reload(sys)
sys.setdefaultencoding('utf-8')

def insertDB(dictData):
    """Insert one crawled policy record into the ``health_policy`` table.

    Args:
        dictData: A sequence (list or tuple, despite the name) holding
            exactly seven values in table column order: title, md5url,
            pub_time, inser_time, website, content, url.

    Returns:
        None. A row of the wrong shape and MySQL errors are reported on
        stdout instead of being raised, so one bad page does not stop the
        caller.
    """
    print("insertDB")
    print(dictData)

    # Column names as declared in the health_policy table; `hid` is left out
    # because it is AUTO_INCREMENT. "inser_time" is the actual column name.
    columns = ("title", "md5url", "pub_time", "inser_time",
               "website", "content", "url")

    try:
        row = tuple(dictData)
    except TypeError:
        row = ()
    if len(row) != len(columns):
        print("insertDB: expected %d values (%s), got %d; nothing inserted"
              % (len(columns), ", ".join(columns), len(row)))
        return

    # The column list is a fixed constant; the row values themselves are
    # passed separately so the driver escapes them.
    sql = "insert into health_policy (%s) values (%s)" % (
        ", ".join(columns), ", ".join(["%s"] * len(columns)))

    conn_local = None
    cur_local = None
    try:
        # NOTE(review): host and credentials are hard-coded placeholders.
        conn_local = MySQLdb.connect(host='192.168.30.7', user='xxx',
                                     passwd='xxx', db='xxx', port=3306)
        conn_local.set_character_set('utf8')
        cur_local = conn_local.cursor()
        cur_local.execute('SET NAMES utf8;')
        cur_local.execute('SET CHARACTER SET utf8;')
        cur_local.execute('SET character_set_connection=utf8;')

        cur_local.execute(sql, row)
        conn_local.commit()
    except MySQLdb.Error as e:
        if len(e.args) >= 2:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        else:
            print("Mysql Error: %s" % (e,))
    finally:
        # Release the cursor and connection even when a statement failed.
        if cur_local is not None:
            cur_local.close()
        if conn_local is not None:
            conn_local.close()


if __name__ == '__main__':
    # Manual smoke test: insert one sample row shaped like the seven
    # non-key columns of health_policy (title, md5url, pub_time, inser_time,
    # website, content, url).
    values = [
        "smoke-test title",
        "0" * 32,
        "1970-01-01 00:00:00",
        "1970-01-01 00:00:00",
        "nhfpc.gov.cn",
        "smoke-test content",
        "http://www.nhfpc.gov.cn/",
    ]
    insertDB(values)

 

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for health_policy
-- ----------------------------
-- Recreates the table from scratch: an existing health_policy table and its
-- rows are dropped first.
DROP TABLE IF EXISTS `health_policy`;
-- One row per stored policy page.
CREATE TABLE `health_policy` (
  -- Auto-generated primary key.
  `hid` int(11) NOT NULL AUTO_INCREMENT,
  -- Policy title.
  `title` varchar(1000) DEFAULT NULL COMMENT '政策标题',
  -- MD5 of the URL (MD5 is a hash digest, not encryption).
  `md5url` varchar(1000) NOT NULL COMMENT '经过MD5加密后的URL',
  -- Publication time.
  `pub_time` datetime DEFAULT NULL COMMENT '发布时间',
  -- Insertion time; the column name really is spelled "inser_time".
  `inser_time` datetime NOT NULL COMMENT '插入时间',
  -- Source web site.
  `website` varchar(1000) DEFAULT NULL COMMENT '来源网站',
  -- Policy content (full text).
  `content` longtext COMMENT '政策内容',
  `url` varchar(1000) DEFAULT NULL,
  PRIMARY KEY (`hid`)
) ENGINE=InnoDB AUTO_INCREMENT=594 DEFAULT CHARSET=utf8;
posted @ 2015-05-06 15:29  justinzhang  阅读(2424)  评论(0)  编辑  收藏  举报