This article demonstrates a simple spider (crawler) program built with Scrapy, shared here for your reference. The details are as follows:
# Standard Python library imports
# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
# My imports
from poetry_analysis.items import PoetryAnalysisItem

HTML_FILE_NAME = r'.+\.html'
class PoetryParser(object):
    """
    Provides a common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains: title - a poem by author
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        # date_pattern is a class attribute, so it must be referenced via self
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item
class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']
    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]
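
The PoetryAnalysisItem class imported from poetry_analysis.items is not included in the listing above. Judging from the fields that parse_poem populates, a minimal items.py might look roughly like this (the field set is inferred from the code, not taken from the original project):

# poetry_analysis/items.py -- a minimal sketch; the field set is inferred
# from what parse_poem assigns, so the original definition may differ.
from scrapy.item import Item, Field

class PoetryAnalysisItem(Item):
    text = Field()    # full poem text joined from the <pre> blocks
    url = Field()     # URL of the page the poem was scraped from
    title = Field()   # poem title taken from <head><title>
    author = Field()  # author name with the "a poem by" prefix removed
    date = Field()    # publication date(s) matched by date_pattern

With the project set up this way, the spider is run in the usual fashion, for example scrapy crawl example.com_poetry -o poems.json to write the collected items to a JSON file.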
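
Note also that the listing targets an older Scrapy release: the scrapy.contrib package, SgmlLinkExtractor and HtmlXPathSelector were deprecated and later removed. On a current Scrapy version the same spider would use the renamed imports and selector API sketched below; only the substitutions are shown, the crawling logic itself is unchanged:

# Equivalents on a recent Scrapy release (a sketch of the renamed APIs only)
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor   # replaces SgmlLinkExtractor

# Inside parse_poem the HtmlXPathSelector wrapper is no longer needed;
# the response object runs XPath queries directly:
#     text = response.xpath('//pre/text()').getall()
#     item['text'] = ''.join(text)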
Hopefully this article proves helpful to readers working on Python programming.