This article walks through a simple spider (web crawler) program built on Scrapy, shared here for your reference. The details are as follows:
# Standard Python library imports

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
from poetry_analysis.items import PoetryAnalysisItem

# Regex fragment matched against link URLs by the crawl rules below
HTML_FILE_NAME = r'.+\.html'


class PoetryParser(object):
    """
    Provides common parsing method for poems formatted this one specific way.
    """
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        hxs = HtmlXPathSelector(response)
        item = PoetryAnalysisItem()
        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url
        # head/title contains: title - a poem by author
        title_text = hxs.select('//head/title/text()').extract()[0]
        item['title'], item['author'] = title_text.split(' - ')
        item['author'] = item['author'].replace('a poem by', '')
        for key in ['title', 'author']:
            item[key] = item[key].strip()
        # The publication date sits in a <p class="small"> element;
        # note date_pattern is a class attribute, so it must be accessed
        # through self (the original referenced it as a bare name, which
        # would raise a NameError at runtime)
        item['date'] = hxs.select("//p[@class='small']/text()").re(self.date_pattern)
        return item


class PoetrySpider(CrawlSpider, PoetryParser):
    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']
    # Follow .html links under each start URL and parse each as a poem
    rules = [Rule(SgmlLinkExtractor(allow=[start_urls[0] + HTML_FILE_NAME]),
                  callback='parse_poem'),
             Rule(SgmlLinkExtractor(allow=[start_urls[1] + HTML_FILE_NAME]),
                  callback='parse_poem')]
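Note that this example targets an older Scrapy release: scrapy.contrib, SgmlLinkExtractor and HtmlXPathSelector were all later replaced (by scrapy.spiders, LinkExtractor and response.xpath() respectively), so on a modern Scrapy install the imports would need updating.

The spider imports PoetryAnalysisItem from poetry_analysis.items, but that module is not shown in the original post. Based on the fields the parser assigns (text, url, title, author, date), a minimal items.py would look roughly like the sketch below; the field set is inferred from parse_poem, not confirmed by the source:

# poetry_analysis/items.py -- a minimal sketch inferred from the fields
# assigned in parse_poem(); the original post does not include this file.
from scrapy.item import Item, Field

class PoetryAnalysisItem(Item):
    text = Field()    # full poem text joined from the <pre> tags
    url = Field()     # page the poem was scraped from
    title = Field()   # parsed from <head><title>
    author = Field()  # parsed from <head><title>, with 'a poem by' stripped
    date = Field()    # date strings matched by PoetryParser.date_pattern

With the project in place, the spider would typically be run from the project root with Scrapy's command-line tool, for example `scrapy crawl example.com_poetry -o poems.json` to export the scraped items as JSON (the output file name here is just an illustration).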
I hope this article proves helpful to your Python programming.