python实现爬虫下载漫画示例
admin
2023-07-31 02:00:49
0

复制代码 代码如下:
#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2

# Runtime configuration. The defaults below are overridden by the positional
# command-line arguments: weburl floder [chapterbegin [threadcount]].
weburl=''            # URL of the comic index page fetched by the main script
floder=''            # directory prefix; the chapter name is appended to it
chapterbegin=0       # index of the first chapter the main loop downloads
currentthreadnum=0   # number of download threads currently running
threadcount=6        # maximum number of simultaneous download threads

if len(sys.argv)>=3:
    weburl=sys.argv[1]
    floder=sys.argv[2]
else:
    print("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
    sys.exit(0)
if len(sys.argv)>=4:
    chapterbegin=int(sys.argv[3])
if len(sys.argv)>=5:
    # downloadchapter waits while currentthreadnum>=threadcount, so a value
    # below 1 would block forever; clamp it to at least one worker.
    threadcount=max(1,int(sys.argv[4]))

 

def jin(i,jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are written as decimal characters; larger digit values
    are written as letters counted from ``'a'`` (10 -> 'a', 11 -> 'b', ...).

    Raises:
        ValueError: if ``i`` is negative or ``jinzhi`` is smaller than 2,
            because the digit loop could not terminate for such input.
    """
    if i<0:
        raise ValueError('i must be non-negative, got %r'%(i,))
    if jinzhi<2:
        raise ValueError('jinzhi must be at least 2, got %r'%(jinzhi,))
    digits=[]
    while True:
        # divmod keeps exact integer arithmetic; float division would lose
        # precision for large values of i.
        i,answer=divmod(i,jinzhi)
        if answer>9:
            digits.append(chr(ord('a')+(answer-10)))
        else:
            digits.append(str(answer))
        if i==0:
            break
    # Digits were produced least-significant first.
    return ''.join(reversed(digits))
def urlparse(p,a,c,k):
    """Expand single-character tokens in ``p`` using the word list ``k``.

    A lookup table is built for the indices ``0 .. c-1``: the key is the
    index written in base 36 (via ``jin``) and the value is ``k[index]``, or
    the key itself when ``k[index]`` is empty. Every character of ``p`` in
    ``0-9`` or ``a-f`` is then replaced by its table entry; all other
    characters are copied unchanged.

    Args:
        p: the encoded text.
        a: unused; kept so the signature stays compatible with callers.
        c: number of entries of ``k`` to load into the table.
        k: list of replacement words, indexed by token value.

    Returns:
        The text with the recognised tokens expanded.

    Raises:
        IndexError: if ``c`` is larger than ``len(k)``.
    """
    # NOTE(review): only the characters 0-9 and a-f are treated as tokens;
    # table entries for 'g' and above, or multi-character keys, are never
    # substituted. Confirm whether wider token support is required.
    d={}
    for index in range(c-1,-1,-1):
        token=jin(index,36)
        d[token]=k[index] or token
    decodable=set('0123456789abcdef')
    # d.get leaves a token character untouched when the table has no entry
    # for it (for example when c < 16) instead of raising KeyError.
    return ''.join(d.get(ch,ch) if ch in decodable else ch for ch in p)
def meispower(s):
    """Extract the image path from a packed ``eval`` script string.

    Steps, as implemented below:
      1. take the text from the first ``}(`` to the end of that line;
      2. drop its last 19 characters;
      3. split the remainder on ``,`` into payload, two integers and a
         ``|``-separated word list (whose leading character is discarded);
      4. decode the payload with ``urlparse``;
      5. locate ``imgpath=...`` (up to the next ``;``) in the decoded text
         and strip 10 leading and 2 trailing characters.

    Args:
        s: the script text containing the packed call.

    Returns:
        The extracted path string.

    Raises:
        IndexError: if ``}(`` or ``imgpath=`` is not found, or the argument
            list has fewer than four comma-separated parts.
        ValueError: if the second or third part is not an integer.
    """
    tail=re.compile(r"(?=\}\().*",re.IGNORECASE).findall(s)[0]
    # presumably removes the trailing "'.split('|'),0,{}))" of the packed
    # call (19 characters) - TODO confirm against a real page
    tail=tail[0:(len(tail)-19)]
    par=tail.split(',')
    # Discard the first character of the word list (presumably its opening
    # quote).
    words=par[3][1:].split('|')
    chapterpath=urlparse(par[0],int(par[1]),int(par[2]),words)
    allurl=re.findall('imgpath=[^;]*',chapterpath)[0]
    # Skip "imgpath=" plus two more characters and drop the last two
    # (presumably the escaped quotes around the value).
    return allurl[10:(len(allurl)-2)]
def pictofile(weburl,filename,loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists. A response
    shorter than 2048 bytes is treated as a failed attempt and is not
    written. At most ``loop + 1`` attempts are made; when they are used up a
    message is printed and the function returns without creating the file.

    Args:
        weburl: URL of the picture.
        filename: path of the file to create.
        loop: number of retries allowed after the first attempt.
    """
    remaining=loop
    while True:
        if remaining<0:
            print('can\'t download the picture %s'%weburl)
            return
        remaining=remaining-1
        if os.path.exists(filename):
            return
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(weburl) as response:
                data=response.read()
        except socket.timeout:
            print('timeout')
            continue
        except Exception as e:
            print('error',e)
            continue
        if len(data)<2048:
            # Too small to be a real picture; try again.
            continue
        print('download from %s name is %s\n'%(weburl,filename))
        try:
            with open('%s'%filename,'wb') as myfile:
                myfile.write(data)
        except OSError as e:
            print('error',e)
            # Remove a partially written file so the exists() check above
            # does not mistake it for a finished download on the next try.
            try:
                os.remove(filename)
            except OSError:
                pass  # nothing was created, or it cannot be removed
            continue
        return
def downloadpic(url,loadpicdir,num):
    """Download one picture into ``loadpicdir`` and release a worker slot.

    The target directory is created when it does not exist. The file is
    saved as ``<num>-manhua<tail>`` inside ``loadpicdir``, where ``<tail>``
    is the trailing run of ``[0-9a-z.]`` characters of ``url``. The global
    ``currentthreadnum`` is always decremented under ``mutex`` before the
    function returns or raises, so the dispatcher's slot accounting stays
    correct even when the download fails.

    Args:
        url: URL of the picture.
        loadpicdir: directory that receives the file.
        num: number used as the file-name prefix.

    Raises:
        SystemExit: if ``loadpicdir`` cannot be created.
    """
    global currentthreadnum,mutex
    try:
        # The directory is created directly instead of calling os.chdir():
        # changing the process-wide working directory from worker threads
        # breaks relative paths and needed a lock that could stay held when
        # directory creation failed.
        if not os.path.isdir(loadpicdir):
            print("can't open the floder %s will be create"%loadpicdir)
            try:
                os.makedirs(loadpicdir,exist_ok=True)
            except OSError:
                print("can't create floder %s"%loadpicdir)
                raise SystemExit(0)
            print('create floder succeed')
        name=re.findall(r'[0-9a-z.]*\Z',url)
        filename='manhua'+name[0]
        pictofile(url,loadpicdir+'//'+str(num)+'-'+filename)
    finally:
        with mutex:
            currentthreadnum=currentthreadnum-1
def downloadchapter(url,loadpicdir,num,begin=0):
    """Start one download thread per page of a chapter.

    Fetches the chapter page at ``manhuaweb + url``, derives the chapter
    name and the image base URL from it, then starts a ``downloadpic``
    thread for every page index in ``range(begin, num)``. No more than
    ``threadcount`` threads are kept running at once; the running count is
    tracked in the global ``currentthreadnum`` under ``mutex``.

    Args:
        url: chapter path, appended to the global ``manhuaweb``.
        loadpicdir: directory prefix; the chapter name is appended to it.
        num: exclusive upper bound of the page index; also passed to
            ``downloadpic`` as its file-name prefix.
        begin: first page index to download.
    """
    global manhuaweb,threadcount,currentthreadnum,mutex
    print(manhuaweb+url)
    with urllib.request.urlopen(manhuaweb+url) as response:
        webdata=response.read().decode('UTF-8')
    # NOTE(review): the published listing shows r'[^_]*' here; the 7-character
    # slice below suggests a leading '<title>' was stripped when the page was
    # rendered, so it is restored - confirm against the original script.
    chaptername=re.findall(r'<title>[^_]*',webdata)[0]
    chaptername=chaptername[7:]
    webscrip=re.findall(r'eval.*[^<>]',webdata)
    chapterurl='http://mhimg.ali213.net'+meispower(webscrip[0])
    for i in range(begin,num):
        # Wait for a free worker slot.
        while currentthreadnum>=threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum=currentthreadnum+1
        try:
            threading.Thread(target=downloadpic,args=(r'%s%d.jpg'%(chapterurl,i),loadpicdir+chaptername,num)).start()
        except socket.error:
            # The thread never ran, so give its slot back. The original also
            # did "i=i-1", which has no effect inside a for loop; the page
            # is skipped exactly as before.
            with mutex:
                currentthreadnum=currentthreadnum-1
        except Exception as error:
            # Give the slot back here too, otherwise the counter stays
            # inflated after a failed start.
            with mutex:
                currentthreadnum=currentthreadnum-1
            print(error,'break')
            print('download chapter %d of picture make a error'%i)
            break
if __name__=='__main__':
    manhuaweb=r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex=threading.Lock()
    mutex2=threading.Lock()

    with urllib.request.urlopen(weburl) as webfile:
        webdata=webfile.read().decode('UTF-8')
    meshmode=re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata=meshmode.findall(webdata)[0]
    indexmode=re.compile(r'([0-9]*页)')
    indexdata=indexmode.findall(meshdata)

    picurlmode=re.compile(r'/comic/[0-9/]*.html')
    picurldata=picurlmode.findall(meshdata)

    chapterlength=len(picurldata)
    nummode=re.compile(r'[\d]+')

    # Both lists are indexed from the end, so chapter i is entry
    # chapterlength-i-1.
    for i in range(chapterbegin,chapterlength):
        manhuachapter=picurldata[chapterlength-i-1]
        downloadchapter(manhuachapter,floder,int(nummode.findall(indexdata[chapterlength-i-1])[0]))
url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/39278.html" class="text-dark fw-bold text-hover-primary fs-6">500 行 Python 代码...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">语法分析器描述了一个句子的语法结构,用来帮助其他的应用进行推理。自然语言引入了很多意外的歧义,以我们...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/831667.html" class="text-dark fw-bold text-hover-primary fs-6">定时清理删除C:\Progra...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">C:\Program Files (x86)下面很多scoped_dir开头的文件夹 写个批处理 定...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/3333.html" class="text-dark fw-bold text-hover-primary fs-6">65536是2的几次方 计算2...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">65536是2的16次方:65536=2⁶ 65536是256的2次方:65536=256 6553...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/4386.html" class="text-dark fw-bold text-hover-primary fs-6">Mobi、epub格式电子书如...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">在wps里全局设置里有一个文件关联,打开,勾选电子书文件选项就可以了。</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div 
class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/uploadfile/202403/9fc6c8bf38a85fb.png#没有设置高宽参数,将以原图输出')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/831666.html" class="text-dark fw-bold text-hover-primary fs-6">scoped_dir32_70...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">一台虚拟机C盘总是莫名奇妙的空间用完,导致很多软件没法再运行。经过仔细检查发现是C:\Program...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/783.html" class="text-dark fw-bold text-hover-primary fs-6">小程序支付时提示:appid和...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">[Q]小程序支付时提示:appid和mch_id不匹配 [A]小程序和微信支付没有进行关联,访问“小...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/831649.html" class="text-dark fw-bold text-hover-primary fs-6"> pycparser 是一个用...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">`pycparser` 是一个用 Python 编写的 C 语言解析器。它可以用来解析 C 代码并构...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/4837.html" class="text-dark fw-bold text-hover-primary 
fs-6">微信小程序使用slider实现...</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">众所周知哈,微信小程序里面的音频播放是没有进度条的,但最近有个项目呢,客户要求音频要有进度条控制,所...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/1628.html" class="text-dark fw-bold text-hover-primary fs-6">python查找阿姆斯特朗数</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">题目解释 如果一个n位正整数等于其各位数字的n次方之和,则称该数为阿姆斯特朗数。 例如1^3 + 5...</span> </div> <!--end::Title--> </div> <!--begin::Item--> <div class="d-flex flex-stack mb-7"> <!--begin::Symbol--> <div class="symbol symbol-60px symbol-2by3 me-4"> <div class="symbol-label" style="background-image: url('/static/assets/images/nopic.gif')"></div> </div> <!--end::Symbol--> <!--begin::Title--> <div class="m-0"> <a href="/program/831541.html" class="text-dark fw-bold text-hover-primary fs-6">Apache Doris 2....</a> <span class="text-gray-600 fw-semibold d-block pt-1 fs-7">亲爱的社区小伙伴们,我们很高兴地向大家宣布,Apache Doris 2.0.0 版本已于...</span> </div> <!--end::Title--> </div> </div> <!--end::Body--> </div> <!--end::Chart Widget 35--> </div> <!--end::Col--> </div> </div> <!--end::Content container--> </div> <!--end::Content--> </div> <!--end::Content wrapper--> <!--begin::Footer--> <div id="kt_app_footer" class="app-footer"> <!--begin::Footer container--> <div class="app-container container-xxl d-flex flex-column flex-md-row flex-center flex-md-stack py-3"> <!--begin::Copyright--> <div class="text-dark order-2 order-md-1"> <span class="text-muted fw-semibold me-1">2025 ©</span> 晓说杂谈<script> var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = "https://hm.baidu.com/hm.js?f7b4581e1f9f88ac28d46df58a8d3ff5"; var s = document.getElementsByTagName("script")[0]; 
s.parentNode.insertBefore(hm, s); })(); </script> <a target="_blank" href="https://beian.miit.gov.cn/">豫ICP备13019747号-13</a> </div> <!--end::Copyright--> <!--begin::Menu--> <ul class="menu menu-gray-600 menu-hover-primary fw-semibold order-1"> <li class="menu-item"> <a href="/tech" target="_blank" class="menu-link px-2">科技分享</a> </li> <li class="menu-item"> <a href="/web" target="_blank" class="menu-link px-2">网络技术</a> </li> <li class="menu-item"> <a href="/hardware" target="_blank" class="menu-link px-2">硬件设备</a> </li> <li class="menu-item"> <a href="/program" target="_blank" class="menu-link px-2">程序人生</a> </li> <li class="menu-item"> <a href="/jinrong" target="_blank" class="menu-link px-2">探索发现</a> </li> <li class="menu-item"> <a href="/jixie" target="_blank" class="menu-link px-2">机械加工</a> </li> <li class="menu-item"> <a href="/dianshang" target="_blank" class="menu-link px-2">电商</a> </li> <li class="menu-item"> <a href="/other" target="_blank" class="menu-link px-2">其他</a> </li> <li class="menu-item"> <a href="/zhishi" target="_blank" class="menu-link px-2">日常知识</a> </li> <li class="menu-item"> <a href="/yulu" target="_blank" class="menu-link px-2">每日语录</a> </li> </ul> <!--end::Menu--> </div> <!--end::Footer container--> </div> <!--end::Footer--> </div> <!--end:::Main--> </div> <!--end::Wrapper--> </div> <!--end::Page--> </div> <!--end::App--> <div id="kt_scrolltop" class="scrolltop" data-kt-scrolltop="true"> <!--begin::Svg Icon | path: icons/duotune/arrows/arr066.svg--> <span class="svg-icon"> <svg width="24" height="24" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg"> <rect opacity="0.5" x="13" y="6" width="13" height="2" rx="1" transform="rotate(90 13 6)" fill="currentColor"></rect> <path d="M12.5657 8.56569L16.75 12.75C17.1642 13.1642 17.8358 13.1642 18.25 12.75C18.6642 12.3358 18.6642 11.6642 18.25 11.25L12.7071 5.70711C12.3166 5.31658 11.6834 5.31658 11.2929 5.70711L5.75 11.25C5.33579 11.6642 5.33579 12.3358 5.75 12.75C6.16421 13.1642 
6.83579 13.1642 7.25 12.75L11.4343 8.56569C11.7467 8.25327 12.2533 8.25327 12.5657 8.56569Z" fill="currentColor"></path> </svg> </span> <!--end::Svg Icon--> </div> <!--begin::Javascript--> <script>var hostUrl = "/static/default/pc/";</script> <!--begin::Global Javascript Bundle(mandatory for all pages)--> <script src="/static/default/pc/plugins/global/plugins.bundle.js"></script> <script src="/static/default/pc/js/scripts.bundle.js"></script> <!--end::Global Javascript Bundle--> <!--end::Javascript--> </body> <!--end::Body--> </html>