In an earlier article we shared a PHP implementation for batch-fetching remote web images and saving them locally; interested readers can click through for the details.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

def get_file_extension(file):
    """Return the file's extension."""
    return os.path.splitext(file)[1]

def mkdir(path):
    """Create the directory if it does not exist and return it."""
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def unique_str():
    """Generate a unique string of fixed length 36."""
    return str(uuid.uuid1())

def get_file(url):
    """Fetch a remote file into memory.
    @url  the file to fetch, path + filename
    """
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

def save_file(path, file_name, data):
    """Save data to a local file.
    @path       local directory
    @file_name  file name
    @data       file contents
    """
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    f = open(path + file_name, "wb")
    f.write(data)
    f.flush()
    f.close()

# get a file's extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique string of length 36
print unique_str()

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
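The script above targets Python 2 (urllib2 and cookielib). For readers on Python 3, here is a minimal sketch of the same idea; it assumes only the standard-library urllib.request and http.cookiejar modules and keeps the sample URL and file name from the script above.

# Minimal Python 3 sketch of the same idea (assumption: Python 3 stdlib only;
# urllib.request / http.cookiejar stand in for urllib2 / cookielib).
import os
import urllib.request
import http.cookiejar

def get_file(url):
    """Fetch a remote file into memory, carrying cookies across the request."""
    try:
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        return opener.open(url).read()
    except Exception as e:
        print(e)
        return None

def save_file(path, file_name, data):
    """Write the downloaded bytes to path/file_name, creating path if needed."""
    if data is None:
        return
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, file_name), "wb") as f:
        f.write(data)

url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))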
Fetching the images on a given URL with Python and saving them locally
# *** encoding: utf-8 ***
__author__ = 'jiangyt'
""" fetch images from specific url v1.0 """
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
    """Check whether the url exists."""
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use the default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            # normal 'found' status
            found = True
        elif resp.status == 302:
            # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:
            # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

def gGetHtmlLines(url):
    """Get the html source, returned as a list of lines."""
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> %s" % e
        return

def gGetHtml(url):
    """Get the html source, returned as a single string."""
    if url is None:
        return
    if not httpExists(url):
        return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> %s" % e
        return

def gGetFileName(url):
    """Get the file name from a url."""
    if url is None:
        return None
    if url == "":
        return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

def gRandFilename(type):
    """Generate a random file name."""
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type

def gGetAbslLink(url, link):
    """Resolve a link found on url to an absolute address."""
    if url is None or link is None:
        return
    if url == '' or link == '':
        return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

def gGetRegList(linesList, regx):
    """Match the regular expression against the given lines, return a list."""
    if linesList is None:
        return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

def gDownloadWithFilename(url, savePath, file):
    """Download the file at url, saving it under the given file name."""
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        f = open(savePath + file, 'w+b')
        f.write(data)
        f.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> %s" % e

def gDownload(url, savePath):
    """Download the file at url, taking the file name from the url."""
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download the jpg images referenced on the page at downloadUrl."""
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)  # get the links that match the regular expression
    if lists is None:
        return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

def gGetHttpAddr(url):
    """Get the site root from a url."""
    if url == '':
        return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
    """Get the parent directory from a url."""
    if url == '':
        return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

def gGetHttpAddrFatherAssign(url, link):
    """Resolve a relative link (containing ..) against a url."""
    if url == '':
        return ''
    if link == '':
        return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1  # number of levels to go up
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

def gGetHtmlLink(url):
    """Collect the htm/html links on the page at url, return a list."""
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

def gDownloadAllJpg(url, savePath):
    """Download the jpgs on the page at url and on the htm pages it links to."""
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

def main():
    """Test."""
    u = 'http://site.douban.com/196738/room/2462453/'  # the page to grab images from
    save = '/root/python/tmp/'  # the directory to save the images to
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "called from intern."
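The bulk of the script above is plumbing: fetch the page source, pick out every src="...jpg" with the regular expression, resolve each match to an absolute address, and download it. As a rough orientation, here is a condensed Python 3 sketch of that same flow; it reuses the regular expression and the douban test page from the script, but swaps the hand-rolled link resolution (gGetAbslLink and its helpers) for the standard-library urllib.parse.urljoin, so treat it as an illustration rather than a drop-in replacement.

# Condensed Python 3 sketch of the same flow (assumption: stdlib only; urljoin
# stands in for the script's manual link-resolution helpers).
import os
import re
import urllib.request
from urllib.parse import urljoin

def download_page_jpgs(page_url, save_dir):
    """Fetch the page, regex out the .jpg references, and download each one."""
    html = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
    os.makedirs(save_dir, exist_ok=True)
    for src in re.findall(r'src\s*="?(\S+?\.jpg)', html, re.IGNORECASE):
        jpg_url = urljoin(page_url, src)        # resolve relative links
        file_name = jpg_url.rsplit("/", 1)[-1]  # take the file name from the url
        with open(os.path.join(save_dir, file_name), "wb") as f:
            f.write(urllib.request.urlopen(jpg_url).read())
        print(file_name)

download_page_jpgs('http://site.douban.com/196738/room/2462453/', '/root/python/tmp/')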
That is all of the Python code for grabbing the images on a web page and saving them locally. We hope you find it useful.