Python实现多线程抓取妹子图
admin
2023-07-31 02:36:08
0

心血来潮写了个多线程抓妹子图,虽然代码还是有一些瑕疵,但是还是记录下来,分享给大家。

Pic_downloader.py

# -*- coding: utf-8 -*-
\"\"\"
Created on Fri Aug 07 17:30:58 2015
 
@author: Dreace
\"\"\"
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool 
type_ = sys.getfilesystemencoding()
def rename():
  """Return the current local time as a 14-digit ``YYYYMMDDHHMMSS`` string.

  The value is used to build the name of the per-run download directory.
  """
  return time.strftime("%Y%m%d%H%M%S")
def rename_2(name):
  """Turn a counter string into a ``.jpg`` file name, zero-padded to 3 digits.

  ``name`` is the textual form of the running image counter. Names of one
  or two characters are left-padded with ``'0'`` up to three characters;
  every other length (including the empty string) is used unchanged.
  Returns ``name`` (possibly padded) with ``'.jpg'`` appended.
  """
  if 1 <= len(name) <= 2:
    name = name.rjust(3, '0')
  return name + '.jpg'
def download_pic(i):
  """Download the image at URL ``i`` into the current working directory.

  URLs rejected by ``Filter`` are skipped. The file name is a random
  number, an underscore, and the zero-padded value of the global ``count``;
  ``count`` is incremented after a successful write. Any exception is
  reported with the "timed out, skipping" message and swallowed so that
  one bad URL does not stop the rest of the batch.

  NOTE(review): ``count`` is updated from several pool threads without a
  lock, so the final total can be off; the random prefix keeps file names
  from colliding in practice.
  """
  global count
  if not Filter(i):
    return
  try:
    content = urllib2.urlopen(i, timeout=time_out)
    try:
      url_content = content.read()
    finally:
      # Always release the connection, even if the read fails.
      content.close()
    file_name = repr(random.randint(10000, 999999999)) + "_" + rename_2(repr(count))
    with open(file_name, "wb") as f:
      f.write(url_content)
    count += 1
  except Exception:
    # Best-effort download: every failure is reported as a timeout.
    print(i + "下载超时,跳过!".decode("utf-8").encode(type_))
def Filter(content, filter_list=None):
  """Return True when ``content`` contains none of the blacklisted substrings.

  ``content`` is the URL to check. ``filter_list`` is an optional iterable
  of substrings to reject; when omitted, the module-level ``Filter_list``
  is used. A trailing/leading newline on an entry is ignored.

  The previous implementation returned True as soon as a single entry was
  absent, so a URL was only rejected if it contained every entry at once.
  """
  if filter_list is None:
    filter_list = Filter_list
  for line in filter_list:
    if line.strip('\n') in content:
      return False
  return True
def get_pic(url_address):
  """Fetch one listing page and collect candidate image URLs.

  The page body is split on double quotes and every piece containing
  ``.jpg`` is appended to the global ``pic_list``. Any exception is
  reported with the "timed out, skipping" message and swallowed.
  """
  try:
    response = urllib2.urlopen(url_address, timeout=time_out)
    try:
      str_ = response.read()
    finally:
      # Always release the connection, even if the read fails.
      response.close()
    for piece in str_.split("\""):
      if ".jpg" in piece:
        pic_list.append(piece)
  except Exception:
    print("获取图片超时,跳过!".decode("utf-8").encode(type_))
# Number of listing pages to scan (pages 1..MAX).
MAX = 2
# Running number of successfully saved images (updated by download_pic).
count = 0
# Timeout, in seconds, for every HTTP request.
time_out = 60
# Number of worker threads in each pool.
thread_num = 30
# Image URLs collected by get_pic.
pic_list = []
# Substrings that mark an image URL as unwanted (see Filter).
Filter_list = ["imgsize.ph.126.net", "img.ph.126.net", "img2.ph.126.net"]

# Every run saves into a fresh, timestamp-named directory.
dir_name = "C:\\Photos\\" + rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()

url_address = "http://sexy.faceks.com/?page="
page_list = [url_address + repr(i) for i in range(1, MAX + 1)]

# Stage 1: scan the listing pages in parallel to fill pic_list.
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic, page_list)
# Release the worker threads of the first pool before starting the second.
page_pool.close()
page_pool.join()
print("%s %s %s" % ("获取到".decode("utf-8").encode(type_), len(pic_list), "张图片,开始下载!".decode("utf-8").encode(type_)))

# Stage 2: download every collected image in parallel.
pool = ThreadPool(thread_num)
pool.map(download_pic, pic_list)
pool.close()
pool.join()
print("%s %s" % (count, "张图片保存在".decode("utf-8").encode(type_) + dir_name))
print("%s %s %s" % ("共耗时".decode("utf-8").encode(type_), time.time() - start_time, "s"))

我们来看下一个网友的作品

#coding: utf-8 #############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python

import re,urllib2,HTMLParser,threading,Queue,time

# Entry links of every gallery
htmlDoorList = []
# Links of the HTML pages that contain images
htmlUrlList = []
# Queue of image URLs (unbounded)
imageUrlList = Queue.Queue(0)
# Number of image URLs captured so far
imageGetCount = 0
# Number of images downloaded so far
imageDownloadCount = 0
# Next-page link reported by the inner-page parser; used to detect the end of a gallery
nextHtmlUrl = ''
# Local directory the images are saved to
localSavePath = '/data/1920x1080/'

# To download another resolution, change replace_str. Available choices:
# 1920x1200, 1980x1920, 1680x1050, 1600x900, 1440x900, 1366x768, 1280x1024, 1024x768, 1280x800
replace_str = '1920x1080'

# Resolution string that appears in the page's image URL and gets replaced by replace_str
replaced_str = '960x600'

# Parser for the inner (gallery) pages
class ImageHtmlParser(HTMLParser.HTMLParser):
    """Extract the big image URL and the next-page link from a gallery page.

    Results are published through module globals: image URLs are put on
    ``imageUrlList`` (and counted in ``imageGetCount``), and the next-page
    link is stored in ``nextHtmlUrl``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.nextUrl = ''

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; ``attrs`` is the list of (name, value) pairs."""
        global imageGetCount, nextHtmlUrl
        if tag == 'img' and len(attrs) > 2:
            if attrs[0] == ('id', 'bigImg'):
                # The value of the second attribute is taken as the image URL
                # (presumably ``src`` -- verify against the page markup).
                url = attrs[1][1]
                # Swap the preview resolution for the requested one.
                url = url.replace(replaced_str, replace_str)
                imageUrlList.put(url)
                imageGetCount = imageGetCount + 1
                print(url)
        elif tag == 'a' and len(attrs) == 4:
            if attrs[0] == ('id', 'pageNext') and attrs[1] == ('class', 'next'):
                # The value of the third attribute is taken as the link target.
                nextHtmlUrl = attrs[2][1]

# Parser for the index (listing) pages
class IndexHtmlParser(HTMLParser.HTMLParser):
    """Collect gallery entry links and the next-page link from an index page.

    A gallery link is recognised in two steps: first an ``li`` tag with an
    attribute value ``photo-list-padding``, then an ``a`` tag with an
    attribute value ``pic``. ``index`` records which step is expected next.
    Found links are appended to ``urlList``; the link of the ``a`` tag with
    ``id="pageNext"`` is stored in ``nextUrl``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.urlList = []
        self.index = 0
        self.nextUrl = ''
        self.tagList = ['li', 'a']
        self.classList = ['photo-list-padding', 'pic']

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; ``attrs`` is the list of (name, value) pairs."""
        if tag == self.tagList[self.index]:
            for attr in attrs:
                if attr[1] != self.classList[self.index]:
                    continue
                if self.index == 0:
                    # First level found
                    self.index = 1
                else:
                    # Second level found
                    self.index = 0
                    # The link is read from the second attribute; skip tags
                    # that are too short instead of raising IndexError.
                    if len(attrs) > 1:
                        print(attrs[1][1])
                        self.urlList.append(attrs[1][1])
                break
        elif tag == 'a':
            for attr in attrs:
                if attr[0] == 'id' and attr[1] == 'pageNext' and len(attrs) > 1:
                    self.nextUrl = attrs[1][1]
                    print('nextUrl: %s' % (self.nextUrl,))
                    break

# Parser for the index pages
indexParser = IndexHtmlParser()
# Parser for the inner (gallery) pages
imageParser = ImageHtmlParser()

# Walk the index pages to collect every gallery entry link
print('开始扫描首页...')
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
# Give up on a page after this many consecutive failures; the loop used to
# retry the same URL forever when the request kept failing.
_max_index_failures = 3
_index_failures = 0
while indexUrl != '':
    print('正在抓取网页: %s' % (host + indexUrl,))
    request = urllib2.Request(host + indexUrl)
    try:
        m = urllib2.urlopen(request)
        try:
            con = m.read()
        finally:
            m.close()
        indexParser.feed(con)
    except urllib2.URLError as e:
        print(e.reason)
        _index_failures += 1
        if _index_failures >= _max_index_failures:
            break
        continue
    _index_failures = 0
    # The parser keeps the last next-page link it saw, so an unchanged
    # link means the current page offered nothing new: stop.
    if indexUrl == indexParser.nextUrl:
        break
    indexUrl = indexParser.nextUrl

print('首页扫描完成,所有图集链接已获得:')
htmlDoorList = indexParser.urlList

# Collect every image URL, starting from the gallery entry links
class getImageUrl(threading.Thread):
    """Producer thread: walks every gallery and lets ``imageParser`` queue image URLs.

    For each entry in ``htmlDoorList`` the gallery pages are fetched one
    after another, following the next-page link that ``imageParser`` stores
    in the global ``nextHtmlUrl``.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global nextHtmlUrl
        for door in htmlDoorList:
            print('开始获取图片地址,入口地址为: %s' % (door,))
            nextHtmlUrl = ''
            current = door
            while current != '':
                print('开始从网页%s获取图片...' % (host + current))
                request = urllib2.Request(host + current)
                try:
                    m = urllib2.urlopen(request)
                    con = m.read()
                except urllib2.URLError as e:
                    print(e.reason)
                    # Move on to the next gallery; the loop used to retry
                    # the same page forever.
                    break
                imageParser.feed(con)
                print('下一个页面地址为: %s' % (nextHtmlUrl,))
                # Stop when the gallery wraps around to its entry page, or
                # when the page offered no new next link (no progress).
                if nextHtmlUrl == door or nextHtmlUrl == current:
                    break
                current = nextHtmlUrl
        print('所有图片地址均已获得: %s' % (imageUrlList,))

class getImage(threading.Thread):
    """Consumer thread: downloads the image URLs queued on ``imageUrlList``.

    Each URL is fetched and saved under ``localSavePath`` using the
    ``<digits>.jpg`` part of the URL as file name. The global
    ``imageDownloadCount`` is incremented per saved file. The thread stops
    as soon as the queue is empty after processing an item.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global imageDownloadCount
        name_pattern = re.compile(r'[0-9]*\.jpg')
        print('开始下载图片...')
        while True:
            print('目前捕获图片数量: %s' % (imageGetCount,))
            print('已下载图片数量: %s' % (imageDownloadCount,))
            image = imageUrlList.get()
            print('下载文件路径: %s' % (image,))
            try:
                cont = urllib2.urlopen(image).read()
            except urllib2.URLError as e:
                print(e.reason)
            else:
                match = name_pattern.search(image)
                if match:
                    print('正在下载文件: %s' % (match.group(),))
                    filename = localSavePath + match.group()
                    with open(filename, 'wb') as f:
                        f.write(cont)
                    imageDownloadCount = imageDownloadCount + 1
                else:
                    print('no match')
            # Checked after failures too: otherwise a failed last item would
            # leave the thread blocked forever in get().
            if imageUrlList.empty():
                break
        print('文件全部下载完成...')

# Start the producer thread that collects image URLs.
get = getImageUrl()
get.start()
print('获取图片链接线程启动:')

# NOTE(review): presumably gives the producer a head start so the queue is
# not empty when the downloader first checks it -- confirm.
time.sleep(2)

# Start the consumer thread that downloads the queued images.
download = getImage()
download.start()
print('下载图片链接线程启动:')

批量抓取指定网页上的所有图片

# -*- coding:utf-8 -*-
# coding=UTF-8
 
import os,urllib,urllib2,re
 
# Page whose images are downloaded.
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
# Prefix (directory) prepended to every saved file name.
outpath = "t:\\"
 
def getHtml(url):
  """Fetch ``url`` and return the response body.

  The body is also printed (debug output). The connection is closed
  before returning, even if reading fails.
  """
  webfile = urllib.urlopen(url)
  try:
    outhtml = webfile.read()
  finally:
    webfile.close()
  print(outhtml)
  return outhtml
 
# Alternatives are tried in this order: every http extension, then every
# https extension. The original list repeated https ".jpeg" and left out
# https ".jpg", so https JPG links were never matched.
_IMAGE_URL_RE = re.compile('|'.join(
  scheme + r'://[^\s,"]*\.' + ext
  for scheme in ('http', 'https')
  for ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp')
))


def getImageList(html):
  """Return every image URL found in ``html``, in order of appearance.

  A URL starts with ``http://`` or ``https://``, contains no whitespace,
  comma or double quote, and ends in ``.jpg``, ``.jpeg``, ``.png``,
  ``.gif`` or ``.bmp``. The resulting list is also printed (debug output).
  """
  imgList = _IMAGE_URL_RE.findall(html)
  print(imgList)
  return imgList
 
def download(imgList, page):
  """Download every URL in ``imgList``.

  Files are named ``outpath`` + ``pic_<page>_<index><ext>`` (lower-cased),
  where ``index`` counts from 1 and ``ext`` is the extension of the last
  path segment of the unquoted URL. A download that fails with IOError is
  reported and skipped so the remaining images are still fetched.
  """
  for x, imgurl in enumerate(imgList, 1):
    extension = os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1]
    filepathname = str(outpath + 'pic_%09d_%010d' % (page, x) + str(extension)).lower()
    print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)
    try:
      urllib.urlretrieve(imgurl, filepathname)
    except IOError as e:
      print('[Debug] Download failed :' + imgurl + ' >> ' + str(e))
 
def downImageNum(pagenum):
  """Run the fetch / extract / download cycle once for each page 1..``pagenum``.

  NOTE(review): the fetched ``url`` does not depend on the page number, so
  every iteration requests the same page -- confirm whether that is intended.
  """
  current_page = 1
  while True:
    if not current_page <= pagenum:
      return
    # Fetch the HTML that url points to.
    markup = getHtml(url)
    # Extract the list of image addresses and download all of them.
    download(getImageList(markup), current_page)
    current_page += 1
 
# Script entry point: process a single page.
if __name__ == '__main__':
  downImageNum(1)

以上就是给大家汇总的3款Python实现的批量抓取妹纸图片的代码了,希望对大家学习Python爬虫能够有所帮助。

相关内容

热门资讯

Mobi、epub格式电子书如... 在wps里全局设置里有一个文件关联,打开,勾选电子书文件选项就可以了。
500 行 Python 代码... 语法分析器描述了一个句子的语法结构,用来帮助其他的应用进行推理。自然语言引入了很多意外的歧义,以我们...
定时清理删除C:\Progra... C:\Program Files (x86)下面很多scoped_dir开头的文件夹 写个批处理 定...
scoped_dir32_70... 一台虚拟机C盘总是莫名奇妙的空间用完,导致很多软件没法再运行。经过仔细检查发现是C:\Program...
65536是2的几次方 计算2... 65536是2的16次方:65536=2¹⁶ 65536是256的2次方:65536=256² 6553...
小程序支付时提示:appid和... [Q]小程序支付时提示:appid和mch_id不匹配 [A]小程序和微信支付没有进行关联,访问“小...
pycparser 是一个用... `pycparser` 是一个用 Python 编写的 C 语言解析器。它可以用来解析 C 代码并构...
微信小程序使用slider实现... 众所周知哈,微信小程序里面的音频播放是没有进度条的,但最近有个项目呢,客户要求音频要有进度条控制,所...
Apache Doris 2.... 亲爱的社区小伙伴们,我们很高兴地向大家宣布,Apache Doris 2.0.0 版本已于...
python清除字符串里非数字... 本文实例讲述了python清除字符串里非数字字符的方法。分享给大家供大家参考。具体如下: impor...