心血来潮写了个多线程抓妹子图的脚本,虽然代码还有一些瑕疵,但还是记录下来,分享给大家。
Pic_downloader.py
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015
@author: Dreace
"""
import os
import random
import sys
import threading
import time
import urllib2
from multiprocessing.dummy import Pool as ThreadPool
# Target encoding for console messages: the UTF-8 message literals in this
# script are decoded and then re-encoded to this encoding before printing.
type_ = sys.getfilesystemencoding()
def rename():
    """Return the current local time as a ``YYYYmmddHHMMSS`` string."""
    stamp_format = "%Y%m%d%H%M%S"
    return time.strftime(stamp_format)
def rename_2(name):
    """Build a ``.jpg`` file name from *name*.

    A one-character name gets two leading zeros, a two-character name gets
    one; any other length is used unchanged.
    """
    # Only lengths 1 and 2 are padded; everything else gets no prefix.
    zero_prefix = {1: '00', 2: '0'}.get(len(name), '')
    return zero_prefix + name + '.jpg'
# Serialises "pick a sequence number, write the file, bump the counter":
# download_pic runs on many pool threads and ``count += 1`` is not atomic.
_count_lock = threading.Lock()


def download_pic(i):
    """Download the image at URL ``i`` into the current working directory.

    URLs rejected by ``Filter`` are ignored.  The file is named
    ``<random number>_<sequence>.jpg``, where the sequence is the global
    ``count`` padded by ``rename_2``.  ``count`` is incremented only after
    the file has been written.

    Any failure (not only a timeout) is reported with the "skipped" message
    and swallowed, so one bad URL does not abort the surrounding ``map``.
    """
    global count
    if not Filter(i):
        return
    try:
        response = urllib2.urlopen(i, timeout=time_out)
        try:
            url_content = response.read()
        finally:
            # Close explicitly: urllib2 responses are not context managers.
            response.close()
        with _count_lock:
            file_name = (repr(random.randint(10000, 999999999)) + "_"
                         + rename_2(repr(count)))
            with open(file_name, "wb") as f:
                f.write(url_content)
            count += 1
    except Exception:
        # Single-argument print() is also a valid Python 2 print statement.
        print(i + "下载超时,跳过!".decode("utf-8").encode(type_))
def Filter(content, filter_list=None):
    """Return True when *content* contains none of the blocked substrings.

    The previous loop returned True as soon as a single entry was absent, so
    a URL was only rejected if it contained every entry at once, and the
    rejection value was an implicit ``None``.  A URL is now rejected
    (``False``) as soon as any entry occurs in it.

    :param content: the string (image URL) to check.
    :param filter_list: substrings to reject; defaults to the module-level
        ``Filter_list``.
    :return: ``True`` if the URL may be downloaded, ``False`` otherwise.
    """
    if filter_list is None:
        filter_list = Filter_list
    for line in filter_list:
        blocked = line.strip('\n')
        # An empty entry would match every string, so it is ignored.
        if blocked and blocked in content:
            return False
    return True
def get_pic(url_address):
    """Fetch one page and add its ``.jpg`` candidates to ``pic_list``.

    The page source is split on double quotes and every fragment that
    contains ``.jpg`` is kept.  A failed request is reported with the
    "skipped" message and leaves ``pic_list`` untouched.
    """
    try:
        response = urllib2.urlopen(url_address, timeout=time_out)
        try:
            str_ = response.read()
        finally:
            # Close explicitly: urllib2 responses are not context managers.
            response.close()
    except Exception:
        # Single-argument print() is also a valid Python 2 print statement.
        print("获取图片超时,跳过!".decode("utf-8").encode(type_))
        return
    # Splitting on '"' isolates quoted attribute values such as src="...".
    found = [piece for piece in str_.split('"') if ".jpg" in piece]
    pic_list.extend(found)
# --- Settings ---------------------------------------------------------------
MAX = 2          # listing pages 1..MAX are scanned
count = 0        # images saved so far; updated by download_pic
time_out = 60    # timeout handed to every urlopen call
thread_num = 30  # size of each thread pool
pic_list = []    # image URLs gathered by get_pic
page_list = []   # listing-page URLs handed to get_pic
# Substrings that Filter checks each image URL against.
Filter_list = ["imgsize.ph.126.net", "img.ph.126.net", "img2.ph.126.net"]
# Backslashes are escaped explicitly; the value is still C:\Photos\<timestamp>.
dir_name = "C:\\Photos\\" + rename()

os.makedirs(dir_name)
os.chdir(dir_name)  # download_pic opens its output files by bare name
start_time = time.time()
url_address = "http://sexy.faceks.com/?page="
for i in range(1, MAX + 1):
    page_list.append(url_address + repr(i))

# --- Stage 1: scan the listing pages ----------------------------------------
page_pool = ThreadPool(thread_num)
try:
    page_pool.map(get_pic, page_list)
finally:
    # Release the scanning threads before the download pool is created.
    page_pool.close()
    page_pool.join()
# The parts are joined with single spaces, matching ``print a, b, c``.
print(" ".join([
    "获取到".decode("utf-8").encode(type_),
    str(len(pic_list)),
    "张图片,开始下载!".decode("utf-8").encode(type_),
]))

# --- Stage 2: download every collected URL ----------------------------------
pool = ThreadPool(thread_num)
try:
    pool.map(download_pic, pic_list)
finally:
    pool.close()
    pool.join()
print(" ".join([
    str(count),
    "张图片保存在".decode("utf-8").encode(type_) + dir_name,
]))
print(" ".join([
    "共耗时".decode("utf-8").encode(type_),
    str(time.time() - start_time),
    "s",
]))
我们来看下一个网友的作品
#coding: utf-8
#############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python
import re
import threading
import time

# The script targets Python 2; the fallbacks only allow this part to be
# imported on Python 3 as well.
try:
    import HTMLParser
    import Queue
    import urllib2
except ImportError:
    import html.parser as HTMLParser
    import queue as Queue
    import urllib.request as urllib2

# Entry link of every gallery.
htmlDoorList = []
# Links of the HTML pages that contain images (not referenced in this part).
htmlUrlList = []
# Queue of image URLs.
imageUrlList = Queue.Queue(0)
# Number of image URLs captured so far.
imageGetCount = 0
# Number of images downloaded so far.
imageDownloadCount = 0
# Next-page link of the gallery being scanned, set by ImageHtmlParser.
nextHtmlUrl = ''
# Local directory the images are saved into.
localSavePath = '/data/1920x1080/'
# To download another resolution, change replace_str.  Available resolutions:
# 1920x1200, 1980x1920, 1680x1050, 1600x900, 1440x900, 1366x768, 1280x1024,
# 1024x768, 1280x800
replace_str = '1920x1080'
replaced_str = '960x600'


class ImageHtmlParser(HTMLParser.HTMLParser):
    """Parser for the inner (gallery) pages.

    Queues the URL of the ``bigImg`` image on ``imageUrlList`` and stores the
    ``pageNext`` link in the module-level ``nextHtmlUrl``.
    """

    def __init__(self):
        self.nextUrl = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *attrs* is a list of ``(name, value)``."""
        global imageGetCount, nextHtmlUrl
        if tag == 'img' and len(attrs) > 2:
            if attrs[0] == ('id', 'bigImg'):
                # The value of the second attribute is used as the image URL,
                # with the resolution segment swapped for the wanted one.
                url = attrs[1][1].replace(replaced_str, replace_str)
                imageUrlList.put(url)
                imageGetCount = imageGetCount + 1
                # Single-argument print() is also valid as a Python 2
                # print statement.
                print(url)
        elif tag == 'a' and len(attrs) == 4:
            if attrs[0] == ('id', 'pageNext') and attrs[1] == ('class', 'next'):
                # The value of the third attribute is used as the link.
                nextHtmlUrl = attrs[2][1]


class IndexHtmlParser(HTMLParser.HTMLParser):
    """Parser for the index (listing) pages.

    Looks for an ``li`` tag carrying the value ``photo-list-padding`` and then
    an ``a`` tag carrying the value ``pic``; the second attribute value of
    that ``a`` tag is appended to ``urlList``.  Outside that two-step search,
    an ``a`` tag with ``id="pageNext"`` updates ``nextUrl``.
    """

    def __init__(self):
        self.urlList = []
        self.index = 0  # 0: waiting for the <li>, 1: waiting for the <a>
        self.nextUrl = ''
        self.tagList = ['li', 'a']
        self.classList = ['photo-list-padding', 'pic']
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; *attrs* is a list of ``(name, value)``."""
        if tag == self.tagList[self.index]:
            wanted = self.classList[self.index]
            if any(value == wanted for _, value in attrs):
                if self.index == 0:
                    # First level found.
                    self.index = 1
                else:
                    # Second level found.
                    self.index = 0
                    # A tag with a single attribute has no second value to
                    # record; skip it instead of raising IndexError.
                    if len(attrs) > 1:
                        print(attrs[1][1])
                        self.urlList.append(attrs[1][1])
        elif tag == 'a':
            if ('id', 'pageNext') in attrs and len(attrs) > 1:
                self.nextUrl = attrs[1][1]
                print(' '.join(['nextUrl:', str(self.nextUrl)]))
# Marker put on imageUrlList by the producer thread once it has finished, so
# the downloader knows when to stop instead of guessing from Queue.empty().
_DONE = None


def _say(*parts):
    """Print *parts* separated by single spaces, like ``print a, b``.

    Unicode parts are encoded first so they are never implicitly mixed with
    the byte-string labels.
    """
    text_type = type(u'')
    print(' '.join(
        part.encode('utf-8') if isinstance(part, text_type) else str(part)
        for part in parts))


# Parser for the index pages.
indexParser = IndexHtmlParser()
# Parser for the inner pages.
imageParser = ImageHtmlParser()

# Collect every gallery entry link by walking the index pages.
print('开始扫描首页...')
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while indexUrl != '':
    _say('正在抓取网页:', host + indexUrl)
    request = urllib2.Request(host + indexUrl)
    try:
        m = urllib2.urlopen(request)
        try:
            con = m.read()
        finally:
            m.close()
    except urllib2.URLError as e:
        print(e.reason)
        # indexUrl would not change, so looping again would retry the same
        # page forever; stop scanning and keep what was collected.
        break
    indexParser.feed(con)
    # nextUrl keeps its previous value when a page has no "next" link, so an
    # unchanged value marks the last page.
    if indexUrl == indexParser.nextUrl:
        break
    indexUrl = indexParser.nextUrl
print('首页扫描完成,所有图集链接已获得:')
htmlDoorList = indexParser.urlList


class getImageUrl(threading.Thread):
    """Producer thread: feeds every gallery page to ``imageParser``."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Scan all galleries, then signal the downloader that it may stop."""
        try:
            for door in htmlDoorList:
                _say('开始获取图片地址,入口地址为:', door)
                self._scan_gallery(door)
            _say('所有图片地址均已获得:', imageUrlList)
        finally:
            # Always sent, even after an unexpected error, so the downloader
            # cannot block forever on an empty queue.
            imageUrlList.put(_DONE)

    def _scan_gallery(self, door):
        """Follow the next-page links of one gallery starting at *door*."""
        global nextHtmlUrl
        page = door
        visited = set()
        # Stops when there is no next link, or when the link leads back to a
        # page already seen (including the entry page itself).
        while page != '' and page not in visited:
            visited.add(page)
            print('开始从网页%s获取图片...' % (host + page))
            request = urllib2.Request(host + page)
            # Cleared so that a page without a next link ends the loop
            # instead of re-reading the previous page's link.
            nextHtmlUrl = ''
            try:
                m = urllib2.urlopen(request)
                try:
                    con = m.read()
                finally:
                    m.close()
            except urllib2.URLError as e:
                print(e.reason)
                break
            imageParser.feed(con)
            _say('下一个页面地址为:', nextHtmlUrl)
            page = nextHtmlUrl


class getImage(threading.Thread):
    """Consumer thread: downloads every URL taken from ``imageUrlList``."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Download queued images until the ``_DONE`` marker arrives."""
        # Declared before the name is first read in this function.
        global imageDownloadCount
        print('开始下载图片...')
        while True:
            _say('目前捕获图片数量:', imageGetCount)
            _say('已下载图片数量:', imageDownloadCount)
            image = imageUrlList.get()
            if image is _DONE:
                break
            _say('下载文件路径:', image)
            try:
                response = urllib2.urlopen(image)
                try:
                    cont = response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                print(e.reason)
                continue
            # The file name is the first "<digits>.jpg" run found in the URL.
            match = re.search(r'[0-9]*\.jpg', image)
            if match:
                _say('正在下载文件:', match.group())
                filename = localSavePath + match.group()
                with open(filename, 'wb') as f:
                    f.write(cont)
                imageDownloadCount = imageDownloadCount + 1
            else:
                print('no match')
        print('文件全部下载完成...')


get = getImageUrl()
get.start()
print('获取图片链接线程启动:')
# No head-start sleep is needed: the downloader blocks on the queue until the
# producer delivers URLs or the _DONE marker.
download = getImage()
download.start()
print('下载图片链接线程启动:')
批量抓取指定网页上的所有图片
# -*- coding:utf-8 -*-
# coding=UTF-8
import os,urllib,urllib2,re
# Search-results page on image.baidu.com that is fetched and scanned for image
# links.  Adjacent literals are concatenated into one unicode string.
url = (
    u"http://image.baidu.com/search/index"
    u"?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1"
    u"&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0"
    u"&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
)
# Prefix prepended to every downloaded file name.
outpath = "t:\\"
def getHtml(url):
    """Fetch *url* and return the response body.

    The body is also echoed to stdout.  The response object is closed even
    when reading fails.
    """
    webfile = urllib.urlopen(url)
    try:
        outhtml = webfile.read()
    finally:
        webfile.close()
    # Single-argument print() is also a valid Python 2 print statement.
    print(outhtml)
    return outhtml
# Extensions in matching priority order.  For one URL start the alternatives
# are tried left to right, so ".jpg" wins over a later ".png" in the same run.
_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
# One alternative per scheme/extension pair.  The original listed the https
# ".jpeg" alternative twice and had no https ".jpg" one, so https .jpg links
# were never found.
_IMAGE_URL_RE = re.compile(
    '(' + '|'.join(
        r'%s://[^\s,"]*\.%s' % (scheme, extension)
        for scheme in ('http', 'https')
        for extension in _IMAGE_EXTENSIONS
    ) + ')'
)


def getImageList(html):
    """Return every image URL found in *html*, in order of appearance.

    A URL starts with ``http://`` or ``https://``, contains no whitespace,
    comma or double quote, and ends with one of ``.jpg``, ``.jpeg``,
    ``.png``, ``.gif`` or ``.bmp`` (case-sensitive).  The list is also echoed
    to stdout.
    """
    imgList = _IMAGE_URL_RE.findall(html)
    # Single-argument print() is also a valid Python 2 print statement.
    print(imgList)
    return imgList
def download(imgList, page):
    """Download every URL in *imgList* next to ``outpath``.

    Files are named ``pic_<page, 9 digits>_<index, 10 digits><ext>``, where
    the index starts at 1 and ``<ext>`` is the lower-cased extension of the
    last path segment of the unquoted URL.  Only the extension is
    lower-cased; ``outpath`` is used exactly as configured.

    Errors raised by ``urllib.urlretrieve`` propagate to the caller.
    """
    for x, imgurl in enumerate(imgList, 1):
        last_segment = urllib2.unquote(imgurl).decode('utf8').split('/')[-1]
        extension = str(os.path.splitext(last_segment)[1]).lower()
        filepathname = outpath + 'pic_%09d_%010d' % (page, x) + extension
        # Single-argument print() is also a valid Python 2 print statement.
        print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)
        urllib.urlretrieve(imgurl, filepathname)
def downImageNum(pagenum):
    """Run the fetch / extract / download cycle for pages 1..*pagenum*."""
    # NOTE(review): every pass requests the same module-level ``url``; the
    # page number only ends up in the file names -- confirm this is intended.
    current = 1
    while current <= pagenum:
        # Fetch the page, pull out the image links, then download them all.
        download(getImageList(getHtml(url)), current)
        current += 1
if __name__ == '__main__':
    # Script entry point: run a single fetch-and-download pass.
    pages_to_fetch = 1
    downImageNum(pages_to_fetch)
以上就是给大家汇总的3款Python实现的批量抓取妹纸图片的代码了,希望对大家学习Python爬虫能够有所帮助。