Python爬取读者并制作成PDF
admin
2023-07-31 02:17:14
0

学了下BeautifulSoup后,做了个网络爬虫,爬取《读者》杂志并用reportlab制作成PDF。

crawler.py

复制代码 代码如下:
#!/usr/bin/env python
#coding=utf-8
"""
    Author:         Anemone
    Filename:       getmain.py
    Last modified:  2015-02-19 16:47
    E-mail:         anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module is reloaded to get it back.  The default encoding is then switched to
# UTF-8 so that implicit str <-> unicode conversions of the scraped Chinese
# text do not fail with the ASCII codec.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Download one article page and extract its metadata and body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with four keys:
            "title":   string of the first <h1> element,
            "writer":  stripped string of the element with id "pub_date",
            "from":    stripped string of the element with id "media_name",
            "context": page text following the first "BAIDU_CLB...;" marker,
                       or the whole page text when the marker is absent.

    Raises:
        AttributeError: if the page lacks the <h1>, "pub_date" or
            "media_name" element, or if one of the latter two has no single
            string child.
        Any exception raised by urllib2.urlopen / read propagates unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    # The article body is taken to start right after the "BAIDU_CLB...;"
    # marker (presumably an embedded ad-script call -- verify against the
    # live page).
    text = soup.get_text()
    parts = re.split("BAIDU_CLB.*;", text)
    # Without the marker re.split returns a single element; fall back to the
    # full text instead of raising IndexError.
    context = parts[1] if len(parts) > 1 else text

    return {"title": title, "writer": writer, "from": _from, "context": context}
def _fetchSoup(url):
    """Download *url* and return its content parsed by BeautifulSoup."""
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    return BeautifulSoup(html)


def getCatalog(issue, maxRetries=5):
    """Crawl every article of one issue, grouped by column.

    Args:
        issue: Issue identifier string such as "201501".  Its first four
            characters and its last two characters are joined with "_" to
            form the issue directory on www.52duzhe.com.
        maxRetries: How many times to try downloading each article before
            giving up on it.

    Returns:
        An ordered dict mapping each column heading (the string of an <h2>
        element) to the list of article dicts returned by getEachArticle,
        in the order the columns and links appear on the page.  Articles
        that still fail after maxRetries attempts are reported on stdout
        and left out.
    """
    from collections import OrderedDict

    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"

    # The first link inside the index page's first table leads to the page
    # whose <h2> headings and links are crawled below.
    soup = _fetchSoup(url + "index.html")
    soup = _fetchSoup(url + soup.table.a.get("href"))

    # OrderedDict keeps the columns in page order on Python 2 as well.
    duzhe = OrderedDict()
    for heading in soup.find_all("h2"):
        column = heading.string
        # Single-argument print(...) behaves the same as the print statement.
        print(column)
        duzhe[column] = list()
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)

            article = None
            lastError = None
            # Bounded retry: an unlimited loop would spin forever on a page
            # that can never be parsed, and a bare "except" would also
            # swallow KeyboardInterrupt.
            for _ in range(maxRetries):
                try:
                    article = getEachArticle(href)
                    break
                except Exception as error:
                    lastError = error
            if article is None:
                print("Skipping %s after %d failed attempts: %s"
                      % (href, maxRetries, lastError))
                continue
            duzhe[column].append(article)
    return duzhe
def readDuZhe(duzhe):
    """Print the title of every article in *duzhe*, one per line.

    Args:
        duzhe: Mapping of column name to a list of article dicts; each
            article dict must contain a "title" key.
    """
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            # Single-argument print(...) is valid as both the Python 2
            # statement and the Python 3 function.
            print(eachArticle["title"])
if __name__ == '__main__':
    # Smoke test: crawl one hard-coded issue and list its article titles.
    readDuZhe(getCatalog("201424"))

getpdf.py

复制代码 代码如下:
#!/usr/bin/env python
#coding=utf-8
"""
    Author:         Anemone
    Filename:       writetopdf.py
    Last modified:  2015-02-20 19:19
    E-mail:         anemone@82flex.com
"""
#coding=utf-8
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler
def _makeStyle(baseStyle, fontSize, leading, firstLineIndent=None):
    """Return a copy of *baseStyle* using the 'song' font and given metrics."""
    style = copy.deepcopy(baseStyle)
    style.fontName = 'song'
    style.fontSize = fontSize
    style.leading = leading
    if firstLineIndent is not None:
        style.firstLineIndent = firstLineIndent
    return style


def _markupSafe(value):
    """Escape &, < and > so that Paragraph renders *value* as literal text."""
    from xml.sax.saxutils import escape
    if value is None:
        return ""
    return escape(value)


def writePDF(issue, duzhe):
    """Render a crawled issue into the file "duzhe<issue>.pdf".

    The document starts with a contents page (issue heading, then every
    column name followed by its article titles), followed by one section per
    article: title, a small "writer  from" line, the body paragraphs, and a
    page break.

    Args:
        issue: Issue identifier string; used in the heading and file name.
        duzhe: Mapping of column name to a list of article dicts with the
            keys "title", "writer", "from" and "context".
    """
    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    # Font files are given by bare name, so they must be findable by
    # reportlab (presumably the Windows fonts directory -- confirm on the
    # target machine).
    pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
    # addMapping(family, bold, italic, font): the bold variants of the
    # 'song' family are drawn with 'hei', the non-bold ones with 'song'.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')

    baseStyle = getSampleStyleSheet()['Normal']
    normalStyle = _makeStyle(baseStyle, 11, 11, firstLineIndent=20)
    titleStyle = _makeStyle(baseStyle, 15, 20)
    firstTitleStyle = _makeStyle(baseStyle, 20, 20, firstLineIndent=50)
    smallStyle = _makeStyle(baseStyle, 8, 8)

    # Paragraph parses its text as XML-like markup, so every piece of
    # scraped text is escaped first; a stray "&" or "<" in an article would
    # otherwise be misread as markup.
    story = []
    story.append(Paragraph("读者{0}期".format(issue), firstTitleStyle))
    for eachColumn in duzhe:
        story.append(Paragraph('__' * 28, titleStyle))
        story.append(Paragraph(_markupSafe('{0}'.format(eachColumn)), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(_markupSafe(eachArticle["title"]), normalStyle))
    story.append(flowables.PageBreak())

    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(
                _markupSafe("{0}".format(eachArticle["title"])), titleStyle))
            story.append(Paragraph(
                " {0}  {1}".format(_markupSafe(eachArticle["writer"]),
                                   _markupSafe(eachArticle["from"])),
                smallStyle))
            # The body is split into paragraphs on runs of two spaces.
            for eachPara in eachArticle["context"].split("  "):
                story.append(Paragraph(_markupSafe(eachPara), normalStyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe" + issue + ".pdf")
    print("Writing PDF…")
    doc.build(story)
def main(issue):
    """Crawl the given issue and write it out as a PDF.

    Args:
        issue: Issue identifier handed to crawler.getCatalog and writePDF.
    """
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == '__main__':
    # The issue string is sliced positionally when the URL is built, so
    # stray surrounding whitespace is trimmed from the typed value.
    issue = raw_input("Enter issue(201501):").strip()
    main(issue)

以上就是本文的全部内容了,希望大家能够喜欢。

相关内容

热门资讯

500 行 Python 代码... 语法分析器描述了一个句子的语法结构,用来帮助其他的应用进行推理。自然语言引入了很多意外的歧义,以我们...
定时清理删除C:\Progra... C:\Program Files (x86)下面很多scoped_dir开头的文件夹 写个批处理 定...
65536是2的几次方 计算2... 65536是2的16次方:65536=2¹⁶ 65536是256的2次方:65536=256² 6553...
Mobi、epub格式电子书如... 在wps里全局设置里有一个文件关联,打开,勾选电子书文件选项就可以了。
scoped_dir32_70... 一台虚拟机C盘总是莫名其妙的空间用完,导致很多软件没法再运行。经过仔细检查发现是C:\Program...
小程序支付时提示:appid和... [Q]小程序支付时提示:appid和mch_id不匹配 [A]小程序和微信支付没有进行关联,访问“小...
pycparser 是一个用... `pycparser` 是一个用 Python 编写的 C 语言解析器。它可以用来解析 C 代码并构...
微信小程序使用slider实现... 众所周知哈,微信小程序里面的音频播放是没有进度条的,但最近有个项目呢,客户要求音频要有进度条控制,所...
python查找阿姆斯特朗数 题目解释 如果一个n位正整数等于其各位数字的n次方之和,则称该数为阿姆斯特朗数。 例如1^3 + 5...
Apache Doris 2.... 亲爱的社区小伙伴们,我们很高兴地向大家宣布,Apache Doris 2.0.0 版本已于...