# scrapy
# -*- coding: utf-8 -*-
import scrapy
import chardet
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.url import urljoin_rfc
from scrapy.http import Request
class Greasemonkey1Spider(scrapy.Spider):
    """Crawl wiki.greasespot.net by following the anchors found on each page.

    Starting from the wiki front page, every response is scanned for ``<a>``
    elements; each usable link is logged and scheduled with :meth:`parse` as
    its callback, so the crawl recurses through the pages it discovers.
    """

    name = "greasemonkey1"
    allowed_domains = ["wiki.greasespot.net"]
    start_urls = (
        'http://wiki.greasespot.net/',
    )

    def parse(self, response):
        """Yield a follow-up request for every usable anchor in *response*.

        An anchor is skipped when it has no direct text node, no ``href``
        attribute, an empty first text node, an empty ``href``, or an
        ``href`` that starts with ``#``.

        Args:
            response: The downloaded page; its ``url`` is the base against
                which each ``href`` is resolved.

        Yields:
            scrapy.Request: One request per accepted link, handled by this
            same method.
        """
        baseurl = response.url
        # Trace output goes through the spider's logger instead of ``print``
        # statements, so the module parses on Python 2 and 3 alike and the
        # messages follow the configured log settings.
        self.log('baseurl =  %s' % baseurl)

        for anchor in response.xpath(r'//a'):
            titles = anchor.xpath(r'text()').extract()
            urls = anchor.xpath(r'@href').extract()
            if not titles or not urls:
                continue

            # Only the first text node and the first href value are considered.
            title, url = titles[0], urls[0]
            if not title or not url or url.startswith('#'):
                continue
            self.log('2222 %s %s' % (title, url))

            # NOTE(review): urljoin_rfc is kept because the file imports it,
            # but it is reported as deprecated upstream (w3lib) -- consider
            # response.urljoin(url) once the minimum Scrapy version is known.
            url2 = urljoin_rfc(baseurl, url)
            self.log('===  %s' % url2)
            yield scrapy.Request(url2, callback=self.parse)