In a previous article we shared a PHP implementation for batch-fetching remote web-page images and saving them locally; interested readers can click through for the details. This post does the same job with Python.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the directory if needed, and return it'''
def mkdir(path):
    # strip leading/trailing whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Generate a unique string, fixed length of 36'''
def unique_str():
    return str(uuid.uuid1())

'''
Fetch the contents of a remote file into memory
@url  the file to fetch, path+filename
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''
Save a file locally
@path      local directory
@file_name file name
@data      file contents
'''
def save_file(path, file_name, data):
    if data == None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# Get the file extension
print get_file_extension("123.jpg")
# Create the directory and return it
#print mkdir("d:/ljq")
# Generate a unique string, fixed length of 36
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
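The script above targets Python 2 (urllib2 and cookielib were merged into urllib.request and http.cookiejar in Python 3). As a minimal sketch of the same download-and-save flow on Python 3, assuming the same target URL and save path as above:

# Python 3 sketch of the same flow (urllib.request / http.cookiejar replace urllib2 / cookielib)
import os
import urllib.request
import http.cookiejar

def get_file(url):
    try:
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        with opener.open(url) as resp:
            return resp.read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    if data is None:
        return
    os.makedirs(path, exist_ok=True)  # create the directory if it does not exist
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

save_file("d:/ljq/", "123.jpg", get_file("http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"))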
Fetching the images at a specified URL with Python and saving them locally
# *** encoding: utf-8 ***
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether the url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200: # normal 'found' status
            found = True
        elif resp.status == 302: # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else: # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get html source, return lines[]"""
def gGetHtmlLines(url):
    if url == None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> %s" % e
        return

"""get html source, return string"""
def gGetHtml(url):
    if url == None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> %s" % e
        return

"""get the file name from a url"""
def gGetFileName(url):
    if url == None: return None
    if url == "": return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

"""given a url and a link found on that page, return the link's absolute address"""
def gGetAbslLink(url, link):
    if url == None or link == None: return
    if url == '' or link == '': return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match a regular expression against the input lines, return a list of matches"""
def gGetRegList(linesList, regx):
    if linesList == None: return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs != None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

"""download the file at url, saving it under the given file name"""
def gDownloadWithFilename(url, savePath, file):
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        file = open(savePath + file, 'w+b')
        file.write(data)
        file.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> %s" % e

"""download the file at url, deriving the file name from the url"""
def gDownload(url, savePath):
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

"""download the jpg images found on the page at downloadUrl"""
def gDownloadHtmlJpg(downloadUrl, savePath):
    lines = gGetHtmlLines(downloadUrl) # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx) # get the links that match the regular expression
    if lists == None: return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

"""get the site root address from a url"""
def gGetHttpAddr(url):
    if url == '': return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory of a url"""
def gGetHttpAddrFather(url):
    if url == '': return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""given a url and a relative ('..') link on it, return the link's absolute address"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '': return ''
    if link == '': return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    numOfFather = 0 # number of '..' levels to climb
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

"""get the htm/html links on the page at url, return a list"""
def gGetHtmlLink(url):
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

"""download the jpgs on the page at url and on the htm pages it links to"""
def gDownloadAllJpg(url, savePath):
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # fetch the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

"""test"""
def main():
    u = 'http://site.douban.com/196738/room/2462453/' # page whose images we want to fetch
    save = '/root/python/tmp/' # directory where the images are saved
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
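Note that main() only exercises gDownloadHtmlJpg; gDownloadAllJpg, which also follows the page's .htm links and grabs the images found there, is defined but never called. A minimal usage sketch, assuming the same douban URL and save directory used in main():

# Sketch: crawl the page itself plus its linked .htm pages (adjust URL and save path as needed)
gDownloadAllJpg('http://site.douban.com/196738/room/2462453/', '/root/python/tmp/')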
The code above is the complete example of fetching the images on a web page with Python and saving them locally; we hope you find it useful.