Python: scrape images from a web page and save them locally
admin
2023-07-31 02:33:21

In a previous article we shared a PHP implementation for batch-downloading images from remote web pages and saving them locally; interested readers can check that post for details. Here is the Python version.

# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Return the file extension of a file name.'''
def get_file_extension(file_name):
  return os.path.splitext(file_name)[1]

'''Create a directory if it is missing, and return its path.'''
def mkdir(path):
  # strip surrounding whitespace
  path = path.strip()
  # strip a trailing \ character
  path = path.rstrip("\\")
  if not os.path.exists(path):
    os.makedirs(path)
  return path

'''Generate a unique string (a UUID, fixed length of 36 characters).'''
def unique_str():
  return str(uuid.uuid1())

'''
Fetch the contents of a URL into memory.
@url the file to fetch, path+filename
'''
def get_file(url):
  try:
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    req = urllib2.Request(url)
    operate = opener.open(req)
    data = operate.read()
    return data
  except BaseException, e:
    print e
    return None

'''
Save data to a local file.
@path local directory
@file_name file name
@data file contents
'''
def save_file(path, file_name, data):
  if data is None:
    return
  mkdir(path)
  if not path.endswith("/"):
    path = path + "/"
  f = open(path + file_name, "wb")
  f.write(data)
  f.flush()
  f.close()

# get a file extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))

Scraping the images at a given URL with Python and saving them locally

# -*- coding: utf-8 -*-
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether a url exists or not"""
def httpExists(url):
  host, path = urlparse.urlsplit(url)[1:3]
  if ':' in host:
    # port specified, try to use it
    host, port = host.split(':', 1)
    try:
      port = int(port)
    except ValueError:
      print 'invalid port number %r' % (port,)
      return False
  else:
    # no port specified, use default port
    port = None
  try:
    connection = httplib.HTTPConnection(host, port=port)
    connection.request("HEAD", path)
    resp = connection.getresponse()
    if resp.status == 200: # normal 'found' status
      found = True
    elif resp.status == 302: # recurse on temporary redirect
      found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
    else: # everything else -> not found
      print "Status %d %s : %s" % (resp.status, resp.reason, url)
      found = False
  except Exception, e:
    print e.__class__, e, url
    found = False
  return found

"""get html source, return a list of lines"""
def gGetHtmlLines(url):
  if url is None: return
  if not httpExists(url): return
  try:
    page = urllib.urlopen(url)
    html = page.readlines()
    page.close()
    return html
  except Exception, e:
    print "gGetHtmlLines() error! Exception ==>> %s" % e
    return

"""get html source, return a string"""
def gGetHtml(url):
  if url is None: return
  if not httpExists(url): return
  try:
    page = urllib.urlopen(url)
    html = page.read()
    page.close()
    return html
  except Exception, e:
    print "gGetHtml() error! Exception ==>> %s" % e
    return

"""extract the file name from a url"""
def gGetFileName(url):
  if url is None: return None
  if url == "": return ""
  arr = url.split("/")
  return arr[-1]

"""generate a random file name"""
def gRandFilename(type):
  fname = ''
  for i in range(16):
    fname = fname + chr(random.randint(65, 90)) # A-Z
    fname = fname + chr(random.randint(48, 57)) # 0-9
  return fname + '.' + type

"""resolve a link found on a page at url into an absolute address"""
def gGetAbslLink(url, link):
  if url is None or link is None: return
  if url == '' or link == '': return url
  addr = ''
  if link[0] == '/':
    addr = gGetHttpAddr(url) + link
  elif len(link) > 3 and link[0:4] == 'http':
    addr = link
  elif len(link) > 2 and link[0:2] == '..':
    addr = gGetHttpAddrFatherAssign(url, link)
  else:
    addr = gGetHttpAddrFather(url) + link
  return addr

"""match a regular expression against the given lines, return a list of captures"""
def gGetRegList(linesList, regx):
  if linesList is None: return
  rtnList = []
  for line in linesList:
    matchs = re.search(regx, line, re.IGNORECASE)
    if matchs is not None:
      allGroups = matchs.groups()
      for foundStr in allGroups:
        if foundStr not in rtnList:
          rtnList.append(foundStr)
  return rtnList

"""download the file at url, saving it under the given file name"""
def gDownloadWithFilename(url, savePath, fileName):
  # argument checks omitted for now
  try:
    opener = urllib.URLopener()
    fp = opener.open(url)
    data = fp.read()
    fp.close()
    f = open(savePath + fileName, 'w+b')
    f.write(data)
    f.close()
  except IOError, error:
    print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
  except Exception, e:
    print "Exception==>> %s" % e

"""download the file at url, deriving the file name from the url"""
def gDownload(url, savePath):
  # argument checks omitted for now
  fileName = gGetFileName(url)
  #fileName = gRandFilename('jpg')
  gDownloadWithFilename(url, savePath, fileName)

"""download every jpg on the page at downloadUrl"""
def gDownloadHtmlJpg(downloadUrl, savePath):
  lines = gGetHtmlLines(downloadUrl) # get the page source
  regx = r"""src\s*="?(\S+)\.jpg"""
  lists = gGetRegList(lines, regx) # get the links that match the regular expression
  if lists is None: return
  for jpg in lists:
    jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
    gDownload(jpg, savePath)
    print gGetFileName(jpg)

"""get the site root address from a url"""
def gGetHttpAddr(url):
  if url == '': return ''
  arr = url.split("/")
  return arr[0] + "//" + arr[2]

"""get the parent directory of a url"""
def gGetHttpAddrFather(url):
  if url == '': return ''
  arr = url.split("/")
  addr = arr[0] + '//' + arr[2] + '/'
  if len(arr) - 1 > 3:
    for i in range(3, len(arr) - 1):
      addr = addr + arr[i] + '/'
  return addr

"""resolve a '..'-relative link against a url into an absolute address"""
def gGetHttpAddrFatherAssign(url, link):
  if url == '': return ''
  if link == '': return ''
  linkArray = link.split("/")
  urlArray = url.split("/")
  partLink = ''
  partUrl = ''
  numOfFather = 0 # number of '..' levels to climb
  for i in range(len(linkArray)):
    if linkArray[i] == '..':
      numOfFather = i + 1
    else:
      partLink = partLink + '/' + linkArray[i]
  for i in range(len(urlArray) - 1 - numOfFather):
    partUrl = partUrl + urlArray[i]
    if i < len(urlArray) - 1 - numOfFather - 1:
      partUrl = partUrl + '/'
  return partUrl + partLink

"""collect the htm/html links on the page at url, return a list"""
def gGetHtmlLink(url):
  # argument checks omitted for now
  rtnList = []
  lines = gGetHtmlLines(url)
  regx = r"""href="?(\S+)\.htm"""
  for link in gGetRegList(lines, regx):
    link = gGetAbslLink(url, link) + '.htm'
    if link not in rtnList:
      rtnList.append(link)
      print link
  return rtnList

"""download the jpgs on the page at url, plus those on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
  # argument checks omitted for now
  gDownloadHtmlJpg(url, savePath)
  # fetch the jpgs on linked pages
  links = gGetHtmlLink(url)
  for link in links:
    gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
  u = 'http://site.douban.com/196738/room/2462453/' # page to scrape images from
  save = '/root/python/tmp/' # directory to save the images in
  print 'download pic from [' + u + ']'
  print 'save to [' + save + '] ...'
  gDownloadHtmlJpg(u, save)
  print "download finished"

if __name__ == "__main__":
  main()
else:
  print "called from intern."

That is all of the code for scraping images from a web page with Python and saving them locally. We hope you find it useful.
