心血来潮写了个多线程抓取妹子图的脚本,虽然代码还有一些瑕疵,但还是记录下来,分享给大家。
Pic_downloader.py
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015

@author: Dreace

Multi-threaded picture downloader (Python 2 script).

Fetches the first MAX listing pages, collects every double-quoted string that
contains ".jpg", and downloads those URLs with a pool of worker threads into a
freshly created, timestamped directory.
"""
import contextlib
import urllib2
import sys
import time
import os
import random
import threading
from multiprocessing.dummy import Pool as ThreadPool

# Encoding used when printing the UTF-8 message literals below.
type_ = sys.getfilesystemencoding()

# Guards the shared download counter, which is updated from worker threads.
_count_lock = threading.Lock()


def _console(text):
    """Re-encode a UTF-8 byte-string literal into the `type_` encoding."""
    return text.decode("utf-8").encode(type_)


def rename():
    """Return the current local time as a YYYYmmddHHMMSS string."""
    return time.strftime("%Y%m%d%H%M%S")


def rename_2(name):
    """Return `name` left-padded with zeros to three characters plus '.jpg'.

    Names that are empty or longer than two characters are not padded.
    """
    padding = {1: '00', 2: '0'}.get(len(name), '')
    return padding + name + '.jpg'


def download_pic(i):
    """Download the picture at URL `i` into the current directory.

    URLs rejected by Filter() are skipped.  Any failure is reported on stdout
    and swallowed so that one bad URL does not stop the worker pool.
    """
    global count
    if not Filter(i):
        return
    try:
        # closing() releases the connection even if read() fails.
        with contextlib.closing(urllib2.urlopen(i, timeout=time_out)) as content:
            url_content = content.read()
        # The random prefix keeps names unique even if two threads read the
        # same `count` value.
        file_name = (repr(random.randint(10000, 999999999)) + "_"
                     + rename_2(repr(count)))
        with open(file_name, "wb") as f:
            f.write(url_content)
        # `count += 1` is not atomic across threads; serialize the update.
        with _count_lock:
            count += 1
    except Exception:
        # Best effort: every failure (not only timeouts) lands here.
        print(i + _console("下载超时,跳过!"))


def Filter(content):
    """Return True when `content` contains none of the Filter_list entries.

    The previous implementation returned True as soon as a single entry was
    missing, which is practically always the case, so nothing was filtered.
    """
    for line in Filter_list:
        if line.strip('\n') in content:
            return False
    return True


def get_pic(url_address):
    """Collect picture URLs from the page at `url_address` into pic_list.

    The page is split on double quotes and every fragment containing ".jpg"
    is kept.  Failures are reported on stdout and swallowed.
    """
    try:
        with contextlib.closing(
                urllib2.urlopen(url_address, timeout=time_out)) as response:
            str_ = response.read()
        for fragment in str_.split("\""):
            if fragment.find(".jpg") != -1:
                pic_list.append(fragment)
    except Exception:
        print(_console("获取图片超时,跳过!"))


MAX = 2            # number of listing pages to scan
count = 0          # pictures saved so far
time_out = 60      # per-request timeout in seconds
thread_num = 30    # worker threads per pool
pic_list = []
page_list = []
# URLs containing any of these substrings are not downloaded.
Filter_list = ["imgsize.ph.126.net", "img.ph.126.net", "img2.ph.126.net"]

# Same value as before, written without the invalid "\P" escape.
dir_name = "C:\\Photos\\" + rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()
url_address = "http://sexy.faceks.com/?page="
page_list.extend(url_address + repr(page) for page in range(1, MAX + 1))

page_pool = ThreadPool(thread_num)
page_pool.map(get_pic, page_list)
page_pool.close()
page_pool.join()
print("%s %d %s" % (_console("获取到"), len(pic_list),
                    _console("张图片,开始下载!")))

pool = ThreadPool(thread_num)
pool.map(download_pic, pic_list)
pool.close()
pool.join()
print("%d %s" % (count, _console("张图片保存在") + dir_name))
print("%s %s %s" % (_console("共耗时"), time.time() - start_time, "s"))
下面我们来看另一位网友的作品(main.py)。
#coding: utf-8
#############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python

import re
import threading
import time

try:
    import HTMLParser
    import Queue
    import urllib2
except ImportError:
    # Python 3 renamed these modules; bind them under the old names so the
    # rest of the script can keep using HTMLParser / Queue / urllib2.
    import html.parser as HTMLParser
    import queue as Queue
    import urllib.request as urllib2

# Entry links of the individual galleries
htmlDoorList = []
# Links of the HTML pages that contain pictures
htmlUrlList = []
# Queue of picture URLs
imageUrlList = Queue.Queue(0)
# Number of picture URLs captured
imageGetCount = 0
# Number of pictures downloaded
imageDownloadCount = 0
# Link to the next page of the current gallery, used to detect the end
nextHtmlUrl = ''
# Local directory the pictures are saved to
localSavePath = '/data/1920x1080/'
# To download another resolution, change replace_str.  Available resolutions:
# 1920x1200, 1980x1920, 1680x1050, 1600x900, 1440x900, 1366x768, 1280x1024,
# 1024x768, 1280x800
replace_str = '1920x1080'

# Resolution string in the page's picture URL that replace_str is swapped in for
replaced_str = '960x600'


# Parser for the pages inside a gallery
class ImageHtmlParser(HTMLParser.HTMLParser):
    """Extract the big picture URL and the next-page link of a gallery page.

    Results are published through module globals: picture URLs are put on
    imageUrlList (and counted in imageGetCount); the next-page link is stored
    in nextHtmlUrl.
    """

    def __init__(self):
        self.nextUrl = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Handle one start tag; `attrs` is a list of (name, value) pairs."""
        global imageGetCount, nextHtmlUrl
        if tag == 'img' and len(attrs) > 2:
            if attrs[0] == ('id', 'bigImg'):
                # The second attribute carries the picture URL; rewrite its
                # resolution segment to the requested one.
                url = attrs[1][1].replace(replaced_str, replace_str)
                imageUrlList.put(url)
                imageGetCount = imageGetCount + 1
                print(url)
        elif tag == 'a' and len(attrs) == 4:
            if attrs[0] == ('id', 'pageNext') and attrs[1] == ('class', 'next'):
                # The third attribute carries the link to the next page.
                nextHtmlUrl = attrs[2][1]


# Parser for the index (listing) pages
class IndexHtmlParser(HTMLParser.HTMLParser):
    """Collect gallery entry links and the next index page link.

    A two-step state machine first waits for an <li> tag having an attribute
    value 'photo-list-padding', then for an <a> tag having an attribute value
    'pic'; the second attribute of that <a> is appended to urlList.  The
    second attribute of an <a id="pageNext"> tag is stored in nextUrl.
    """

    def __init__(self):
        self.urlList = []
        # Position in tagList/classList the state machine is waiting for.
        self.index = 0
        self.nextUrl = ''
        self.tagList = ['li', 'a']
        self.classList = ['photo-list-padding', 'pic']
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Handle one start tag; `attrs` is a list of (name, value) pairs."""
        wanted = self.classList[self.index]
        if (tag == self.tagList[self.index]
                and any(value == wanted for _, value in attrs)):
            if self.index == 0:
                # First level found
                self.index = 1
            else:
                # Second level found
                self.index = 0
                # Guard: a tag with a single attribute has no link to read
                # (indexing it used to raise IndexError).
                if len(attrs) > 1:
                    print(attrs[1][1])
                    self.urlList.append(attrs[1][1])
        elif tag == 'a' and ('id', 'pageNext') in attrs and len(attrs) > 1:
            # Checked in every state, so the next-page link is not lost while
            # the state machine is still waiting for an <a ... pic> tag.
            self.nextUrl = attrs[1][1]
            print('nextUrl: %s' % self.nextUrl)
# Index-page HTML parser
indexParser = IndexHtmlParser()
# Gallery-page HTML parser
imageParser = ImageHtmlParser()

# Set by the URL-collecting thread once every picture URL has been queued, so
# the download thread knows when an empty queue really means "finished".
_urlsCollected = threading.Event()


def _report(label, value):
    """Print `label` and `value` separated by one space."""
    try:
        line = '%s %s' % (label, value)
    except UnicodeDecodeError:
        # Python 2: a unicode value cannot be mixed with a non-ASCII byte
        # label; encode the value first.
        line = '%s %s' % (label, value.encode('utf-8'))
    print(line)


def _fetch(url):
    """Return the body of `url`, closing the connection afterwards.

    Raises urllib2.URLError when the request fails.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        response.close()


# Walk the index pages to collect every gallery entry link
print('开始扫描首页...')
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while indexUrl != '':
    _report('正在抓取网页:', host + indexUrl)
    try:
        con = _fetch(host + indexUrl)
    except urllib2.URLError as e:
        print(e.reason)
        # indexUrl cannot advance after a failure; stop scanning instead of
        # retrying the same page forever.
        break
    # Drop any unparsed tail of the previous page before feeding a new one.
    indexParser.reset()
    indexParser.feed(con)
    if indexUrl == indexParser.nextUrl:
        break
    indexUrl = indexParser.nextUrl

print('首页扫描完成,所有图集链接已获得:')
htmlDoorList = indexParser.urlList


# Collect the picture URLs of every gallery, starting from its entry link
class getImageUrl(threading.Thread):
    """Producer thread: walks each gallery page by page.

    imageParser puts the picture URLs on imageUrlList and stores each page's
    next-page link in the global nextHtmlUrl.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global nextHtmlUrl
        try:
            for door in htmlDoorList:
                _report('开始获取图片地址,入口地址为:', door)
                page = door
                visited = set()
                # Stop when a page has no next link or links back to a page
                # already fetched (galleries wrap around to their first page).
                while page != '' and page not in visited:
                    visited.add(page)
                    print('开始从网页%s获取图片...' % (host + page))
                    # Cleared so a page without a next link ends the walk.
                    nextHtmlUrl = ''
                    try:
                        con = _fetch(host + page)
                    except urllib2.URLError as e:
                        print(e.reason)
                        break
                    imageParser.reset()
                    imageParser.feed(con)
                    _report('下一个页面地址为:', nextHtmlUrl)
                    page = nextHtmlUrl
            _report('所有图片地址均已获得:', imageUrlList)
        finally:
            # Always release the downloader, even if this thread fails.
            _urlsCollected.set()


class getImage(threading.Thread):
    """Consumer thread: downloads every queued picture into localSavePath."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Declared before first use; the late declaration was a SyntaxWarning
        # on Python 2 and is a SyntaxError on Python 3.
        global imageDownloadCount
        print('开始下载图片...')
        while True:
            try:
                image = imageUrlList.get(timeout=1)
            except Queue.Empty:
                # Finished only when the producer is done AND nothing is left;
                # an empty queue alone may just mean the producer is slower.
                if _urlsCollected.is_set() and imageUrlList.empty():
                    break
                continue
            _report('目前捕获图片数量:', imageGetCount)
            _report('已下载图片数量:', imageDownloadCount)
            _report('下载文件路径:', image)
            try:
                cont = _fetch(image)
            except urllib2.URLError as e:
                print(e.reason)
                continue
            # The file name is the first run of digits followed by ".jpg".
            match = re.search(r'[0-9]*\.jpg', image)
            if match:
                _report('正在下载文件:', match.group())
                filename = localSavePath + match.group()
                with open(filename, 'wb') as f:
                    f.write(cont)
                imageDownloadCount = imageDownloadCount + 1
            else:
                print('no match')
        print('文件全部下载完成...')


get = getImageUrl()
get.start()
print('获取图片链接线程启动:')

time.sleep(2)
download = getImage()
download.start()
print('下载图片链接线程启动:')
批量抓取指定网页上的所有图片
# -*- coding:utf-8 -*-
# coding=UTF-8
"""Download every image referenced by a single web page.

The page at `url` is fetched, scanned for http/https links ending in a known
image extension, and each link is saved under `outpath`.
"""
import os
import re
import urllib

try:
    import urllib2
    from urllib import unquote, urlopen, urlretrieve
except ImportError:
    # Python 3 reorganised urllib/urllib2; keep the old module name bound.
    import urllib.request as urllib2
    from urllib.parse import unquote
    from urllib.request import urlopen, urlretrieve

url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"

# One alternative per scheme/extension pair, in the order of the original
# hand-written pattern.  That pattern listed https+jpeg twice and never
# https+jpg; generating the alternatives removes the omission.
_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
_IMAGE_URL_RE = re.compile('|'.join(
    r'%s://[^\s,"]*\.%s' % (scheme, extension)
    for scheme in ('http', 'https')
    for extension in _IMAGE_EXTENSIONS))


def getHtml(url):
    """Fetch `url`, print the page body and return it as a string."""
    webfile = urlopen(url)
    try:
        outhtml = webfile.read()
    finally:
        webfile.close()
    if not isinstance(outhtml, str):
        # Python 3 returns bytes; the request asks for UTF-8 (ie=utf-8).
        outhtml = outhtml.decode('utf-8', 'replace')
    print(outhtml)
    return outhtml


def getImageList(html):
    """Return every image URL found in `html`, in document order.

    A URL starts with http:// or https://, contains no whitespace, comma or
    double quote, and ends with .jpg, .jpeg, .png, .gif or .bmp.
    """
    imgList = _IMAGE_URL_RE.findall(html)
    print(imgList)
    return imgList


def download(imgList, page):
    """Save each URL of `imgList` as <outpath>pic_<page>_<index><extension>.

    The index starts at 1.  A failed download is reported and skipped so the
    remaining pictures are still fetched.
    """
    for x, imgurl in enumerate(imgList, 1):
        # Extension of the (percent-decoded) last path segment.
        extension = os.path.splitext(unquote(imgurl).split('/')[-1])[1]
        filepathname = str(
            outpath + 'pic_%09d_%010d' % (page, x) + extension).lower()
        print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)
        try:
            urlretrieve(imgurl, filepathname)
        except IOError as error:
            print('[Debug] Download failed :' + imgurl + ' (' + str(error) + ')')


def downImageNum(pagenum):
    """Run the fetch / extract / download cycle `pagenum` times.

    NOTE(review): every iteration requests the same `url`, so pages beyond
    the first re-download the same pictures under a new page number.
    """
    page = 1
    while page <= pagenum:
        html = getHtml(url)  # HTML content that url points to
        imageList = getImageList(html)  # list of all picture addresses
        download(imageList, page)  # download all the pictures
        page = page + 1


if __name__ == '__main__':
    downImageNum(1)
以上就是给大家汇总的3款Python实现的批量抓取妹纸图片的代码了,希望对大家学习Python爬虫能够有所帮助。