Basic modules
A Python crawler (web spider) fetches pages from a website, then parses them and extracts data.
The basic approach uses modules such as urllib, urllib2, and re.
Basic usage, by example:
(1) Make a basic GET request and fetch the page HTML
#!coding=utf-8
import urllib2

url = 'http://www.baidu.com/'
# build the request
request = urllib2.Request(url)
try:
    # send the request and get the response
    response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
    if hasattr(e, 'reason'):
        print e.reason
else:
    # read the response body
    html = response.read()
    # read the response headers
    headers = response.info()
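If the GET request needs query parameters, urllib.urlencode builds the query string. A minimal sketch; the search endpoint and the wd parameter are just an illustration, not something the original covered:

#!coding=utf-8
import urllib
import urllib2

# illustrative only: a search endpoint with one query parameter
base_url = 'http://www.baidu.com/s'
query = urllib.urlencode({'wd': 'python'})
# for GET, the encoded parameters are appended to the URL itself
response = urllib2.urlopen(base_url + '?' + query)
print response.geturl()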
(2) Submitting a form
#!coding=utf-8
import urllib2
import urllib

post_url = ''
post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)
response = urllib2.urlopen(request)
html = response.read()
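Passing data to urllib2.Request is what turns the request into a POST. After submitting, it is worth checking the response before parsing it; a small self-contained sketch (the login URL is a placeholder, and what counts as success differs from site to site):

#!coding=utf-8
import urllib
import urllib2

# placeholder URL; in practice this is the form's action address
data = urllib.urlencode({'username': 'username', 'password': 'password'})
response = urllib2.urlopen(urllib2.Request('http://example.com/login', data))
print response.getcode()  # HTTP status code, e.g. 200
print response.geturl()   # final URL after redirects, often hints at login success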
(3) Scraping a Baidu Tieba thread with a regular expression
#!coding=utf-8
import urllib2
import re

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
# Tieba pages are served as GBK, hence the decode
myPage = urllib2.urlopen(url).read().decode('gbk')
# capture each post body; the original snippet was cut off here, so the
# closing </div> and the findall loop below are a reconstruction
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.S)
myItems = myRe.findall(myPage)
for item in myItems:
    print item.strip().encode('utf8')
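The captured fragments still contain inline HTML such as <br> and <img> tags. A hedged cleanup helper, assuming the tags can simply be dropped:

#!coding=utf-8
import re

def clean(fragment):
    # keep line breaks, drop every other residual tag
    fragment = re.sub(r'<br\s*/?>', '\n', fragment)
    return re.sub(r'<[^>]+>', '', fragment).strip()

print clean('first line<br/>second <img src="x.gif"> line')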
(4) Simulating a login: fetching mail from a 163 mailbox
#coding:utf-8
'''
Log in to a 163 mailbox and download mail contents
'''
import urllib
import urllib2
import cookielib
import re
import time
import json


class Email163:
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    sid = None
    mailBaseUrl = 'http://twebmail.mail.163.com'

    def __init__(self):
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        urllib2.install_opener(urllib2.build_opener(cookiePro))

    def login(self, user, pwd):
        '''
        Log in
        '''
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1
        })
        # note: the login URL differs between webmail versions
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid='+user+'&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())
        patt = re.compile('sid=([^"]+)', re.I)
        patt = patt.search(res)

        uname = user.split('@')[0]
        self.user = user
        if patt:
            self.sid = patt.group(1).strip()
            print '%s Login Successful.....' % (uname)
        else:
            print '%s Login failed....' % (uname)

    def getInBox(self):
        '''
        Get the list of messages in the inbox
        '''
        print '\nGet mail lists.....\n'
        sid = self.sid
        url = self.mailBaseUrl+'/jy3/list/list.do?sid='+sid+'&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        # extract the mail list; the opening tags of this regex were lost
        # when the original post was published, so '<td[^>]+>' is a guess
        mailList = []
        patt = re.compile(r'<td[^>]+>.*?href="([^"]+)"[^>]+>(.*?)</a>.*?<td[^>]+>.*?href="[^>]+>(.*?)</a>', re.I | re.S)
        patt = patt.findall(res)
        if not patt:
            return mailList

        for i in patt:
            line = {
                'from': i[1].decode('utf8'),
                'url': self.mailBaseUrl+i[0],
                'subject': i[2].decode('utf8')
            }
            mailList.append(line)
        return mailList

    def getMailMsg(self, url):
        '''
        Download the mail content
        '''
        content = ''
        print '\n Download.....%s\n' % (url)
        res = urllib2.urlopen(url).read()

        patt = re.compile('contentURL:"([^"]+)"', re.I)
        patt = patt.search(res)
        if patt == None:
            return content
        url = '%s%s' % (self.mailBaseUrl, patt.group(1))
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        Djson = json.JSONDecoder(encoding='utf8')
        jsonRes = Djson.decode(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content


'''
Demo
'''
# initialize
mail163 = Email163()
# log in (substitute your own credentials here)
mail163.login('username@163.com', 'password')
time.sleep(2)
# fetch the inbox
elist = mail163.getInBox()
# fetch each mail's content
for i in elist:
    print 'Subject: %s  From: %s  Content:\n%s' % (
        i['subject'].encode('utf8'),
        i['from'].encode('utf8'),
        mail163.getMailMsg(i['url']).encode('utf8'))
(5) Pages that require a login

#1 handling cookies
import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()

#2 using a proxy together with cookies
# (proxy_support was undefined in the original; a ProxyHandler provides it)
proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)

#3 handling forms
import urllib
postdata = urllib.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    'fk': fk,  # fk is generated by the site; scrape it from the login page first
    'login_submit': '登录'
})
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata
)
result = urllib2.urlopen(req).read()

#4 masquerading as a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers
)

#5 defeating "anti-leech" Referer checks
headers = {
    'Referer': 'http://www.cnbeta.com/articles'
}
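Putting the pieces together: a minimal, illustrative login flow that installs a cookie-aware opener, POSTs the form with a browser-like User-Agent, and then requests a page that needs the session. All URLs and field names below are placeholders:

#!coding=utf-8
import urllib
import urllib2
import cookielib

# 1. an opener that stores and replays cookies
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
urllib2.install_opener(opener)

# 2. POST the login form with a browser-like User-Agent
postdata = urllib.urlencode({'username': 'XXXXX', 'password': 'XXXXX'})
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'}
urllib2.urlopen(urllib2.Request('http://example.com/login', postdata, headers))

# 3. the session cookie is now sent automatically
protected = urllib2.urlopen('http://example.com/mailbox').read()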
(6) Multithreading

from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the number of concurrent worker threads
# JOBS is the number of tasks
q = Queue()
NUM = 2
JOBS = 10

# the handler for a single task
def do_something_using(arguments):
    print arguments

# the worker: keeps taking tasks off the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)
        q.task_done()

# start NUM threads waiting on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# enqueue the JOBS tasks
for i in range(JOBS):
    q.put(i)

# wait for all jobs to finish
q.join()
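In a crawler, the queued tasks are typically URLs; the same worker/queue pattern with the worker doing the fetching (the URL list is chosen purely for illustration):

#!coding=utf-8
import urllib2
from threading import Thread
from Queue import Queue

q = Queue()

def working():
    while True:
        url = q.get()
        try:
            # print the page size as a stand-in for real processing
            print url, len(urllib2.urlopen(url).read())
        except urllib2.URLError, e:
            print url, e
        q.task_done()

for i in range(4):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

for url in ['http://www.baidu.com/', 'http://tieba.baidu.com/']:
    q.put(url)
q.join()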
The Scrapy framework

Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python, used to crawl web sites and extract structured data from their pages. It has a wide range of uses: data mining, monitoring, automated testing, and more. I have only just started learning it, so I won't judge it yet; it merely feels a bit Java-like to me, in that it leans on quite a few other modules.
(I) Creating a scrapy project
# use `scrapy startproject scrapy_test` to create the project
├── scrapy_test
│   ├── scrapy.cfg
│   └── scrapy_test
│       ├── __init__.py
│       ├── items.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           └── __init__.py

(II) Description of the files
scrapy.cfg: the project configuration file
items.py: defines the data structures to be extracted
pipelines.py: pipeline definitions, used to post-process the data extracted into items, e.g. to save it (see the sketch after this list)
settings.py: the crawler configuration file
spiders: the directory that holds the spiders
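To make "post-process" concrete, here is a minimal illustrative pipeline; the class name and output file are mine, not part of the generated project:

# pipelines.py -- a sketch: append every item to a JSON-lines file.
# Enable it via the ITEM_PIPELINES setting in settings.py (the exact
# form of that setting varies between Scrapy versions).
import json

class JsonWriterPipeline(object):
    def __init__(self):
        self.f = open('items.jl', 'ab')

    def process_item(self, item, spider):
        # Item objects behave like dicts, so they serialize easily
        self.f.write(json.dumps(dict(item)) + '\n')
        return item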
(III) Dependencies

The dependency packages are a bit of a hassle.
# install the python-dev package
apt-get install python-dev
# twisted, w3lib, six, queuelib, cssselect, libxslt
pip install w3lib
pip install twisted
pip install lxml
apt-get install libxml2-dev libxslt-dev
apt-get install python-lxml
pip install cssselect
pip install pyOpenSSL
sudo pip install service_identity
# once these are installed, `scrapy startproject test` will create a project

(IV) A crawling example
(1) Create the scrapy project

dizzy@dizzy-pc:~/Python/spit$ scrapy startproject itzhaopin
New Scrapy project 'itzhaopin' created in:
    /home/dizzy/Python/spit/itzhaopin

You can start your first spider with:
    cd itzhaopin
    scrapy genspider example example.com
dizzy@dizzy-pc:~/Python/spit$ cd itzhaopin
dizzy@dizzy-pc:~/Python/spit/itzhaopin$ tree
.
├── itzhaopin
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       └── __init__.py
└── scrapy.cfg

(2) Define the data structure to extract: items.py
from scrapy.item import Item, Field

# the data we want to extract
class TencentItem(Item):
    name = Field()           # job title
    catalog = Field()        # job category
    workLocation = Field()   # work location
    recruitNumber = Field()  # number of openings
    detailLink = Field()     # link to the job details page
    publishTime = Field()    # publish time
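Items behave like dictionaries with a fixed set of allowed keys, which is worth knowing before writing the spider. A tiny illustrative check (the field value is made up):

#coding=utf-8
from scrapy.item import Item, Field

class TencentItem(Item):
    name = Field()

item = TencentItem()
item['name'] = u'engineer'   # fields are assigned dict-style
print item['name']
# item['salary'] = 1         # would raise KeyError: salary is not declared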
(3) Implement the Spider class

- A spider is a Python class that inherits from scrapy.contrib.spiders.CrawlSpider; it has three members that must be defined.
- name: the spider's name, its unique identifier.
- start_urls: a list of URLs; the spider starts crawling from these pages.
- parse(): a method; once a page from start_urls has been downloaded, this method is called to parse the page content, and it must return either the next pages to crawl or a list of items.
Create a new spider in the spiders directory, tencent_spider.py:
#coding=utf-8
from scrapy.spider import BaseSpider


class DmozSpider(BaseSpider):
    name = 'dmoz'
    allowed_domains = ['dmoz.org']
    start_urls = [
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/'
    ]

    def parse(self, response):
        filename = response.url.split('/')[-2]
        # save the raw page; the original wrote response.info, which does not
        # exist on a Response object -- the page content is response.body
        open(filename, 'wb').write(response.body)

This one is simpler. Run the spider with:

scrapy crawl dmoz
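The DmozSpider above only dumps raw pages; to actually return the TencentItem objects defined earlier, parse() would extract fields with selectors. A hedged sketch against the old Scrapy selector API; the start URL and every XPath expression here are assumptions that would have to be checked against the real page:

#coding=utf-8
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from itzhaopin.items import TencentItem


class TencentSpider(BaseSpider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']  # assumed entry page

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # illustrative XPath: one table row per job posting
        for row in hxs.select('//table//tr[td/a]'):
            item = TencentItem()
            item['name'] = row.select('./td[1]/a/text()').extract()
            item['detailLink'] = row.select('./td[1]/a/@href').extract()
            item['catalog'] = row.select('./td[2]/text()').extract()
            item['recruitNumber'] = row.select('./td[3]/text()').extract()
            item['workLocation'] = row.select('./td[4]/text()').extract()
            item['publishTime'] = row.select('./td[5]/text()').extract()
            yield item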