学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 ReportLab 制作成 PDF。
crawler.py
复制代码 代码如下:
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: getmain.py
Last modified: 2015-02-19 16:47
E-mail: anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back. Setting
# UTF-8 here makes implicit str/unicode conversions of the scraped Chinese
# text use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Download one article page and extract its metadata and body text.

    Example URL: http://www.52duzhe.com/2015_01/duzh20150104.html

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys "title" (string of the first <h1>), "writer"
        (stripped string of the element with id "pub_date"), "from"
        (stripped string of the element with id "media_name") and
        "context" (the page text that follows the first "BAIDU_CLB...;"
        marker, or the whole page text when the marker is absent).

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        AttributeError: If the <h1>, "pub_date" or "media_name" element is
            missing or does not hold a single string.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so the
        # connection is closed explicitly even when read() fails.
        response.close()

    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    text = soup.get_text()
    parts = re.split(r"BAIDU_CLB.*;", text)
    # Without the marker re.split returns a single element; fall back to the
    # full page text instead of raising IndexError.
    context = parts[1] if len(parts) > 1 else text

    return {"title": title, "writer": writer, "from": _from, "context": context}
def _fetchSoup(pageUrl):
    """Download pageUrl and return it parsed as a BeautifulSoup tree."""
    response = urllib2.urlopen(pageUrl)
    try:
        html = response.read()
    finally:
        # Close explicitly: urllib2 responses are not context managers.
        response.close()
    return BeautifulSoup(html)


def getCatalog(issue, maxRetries=5):
    """Crawl every article of one issue, grouped by magazine column.

    The issue's index page is fetched first; the first link inside its
    <table> leads to the page whose <h2> headings name the columns. Every
    link found under a heading's parent element is downloaded with
    getEachArticle.

    Args:
        issue: Issue identifier string; its first four characters and its
            last two characters form the URL directory, e.g. "201424"
            gives ".../2014_24/".
        maxRetries: Maximum number of download attempts per article before
            the article is skipped.

    Returns:
        An ordered dict mapping each column heading to the list of article
        dicts returned by getEachArticle, in page order.
    """
    from collections import OrderedDict

    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
    indexSoup = _fetchSoup(url + "index.html")
    firstUrl = url + indexSoup.table.a.get("href")
    soup = _fetchSoup(firstUrl)

    # OrderedDict keeps the columns in the order they appear on the page.
    duzhe = OrderedDict()
    for heading in soup.find_all("h2"):
        print(heading.string)
        duzhe[heading.string] = []
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)

            # Bounded retry: a permanently broken page must not hang the
            # crawl, and Ctrl-C must still be able to interrupt it.
            article = None
            lastError = None
            for _ in range(maxRetries):
                try:
                    article = getEachArticle(href)
                    break
                except Exception as err:
                    lastError = err
            if article is None:
                print("Skipping {0}: {1}".format(href, lastError))
                continue

            duzhe[heading.string].append(article)
    return duzhe
def readDuZhe(duzhe):
    """Print the title of every article in the catalog, one per line.

    Args:
        duzhe: Mapping of column name to a list of article dicts, each of
            which has a "title" key.
    """
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(eachArticle["title"])
if __name__ == '__main__':
    # The issue may be given as the first command-line argument; with no
    # argument the sample issue "201424" is crawled.
    issue = sys.argv[1] if len(sys.argv) > 1 else "201424"
    readDuZhe(getCatalog(issue))
getpdf.py
复制代码 代码如下:
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: writetopdf.py
Last modified: 2015-02-20 19:19
E-mail: anemone@82flex.com
"""
#coding=utf-8
import reportlab.rl_config
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib import fonts
import copy
from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables
from reportlab.lib.styles import getSampleStyleSheet
import crawler
def _makeStyle(baseStyle, **attrs):
    """Return a deep copy of baseStyle with the given attributes overridden."""
    style = copy.deepcopy(baseStyle)
    for attrName, value in attrs.items():
        setattr(style, attrName, value)
    return style


def writePDF(issue, duzhe):
    """Render a crawled issue into the file "duzhe<issue>.pdf".

    The document starts with a table of contents (column names followed by
    their article titles), then contains one section per article: title,
    writer/source line and body paragraphs, each article ending with a page
    break.

    Args:
        issue: Issue identifier string; used in the heading and the output
            file name.
        duzhe: Mapping of column name to a list of article dicts with the
            keys "title", "writer", "from" and "context".
    """
    from xml.sax.saxutils import escape

    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
    # Map the bold variants of the 'song' family to the 'hei' font.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')

    baseStyle = getSampleStyleSheet()['Normal']
    normalStyle = _makeStyle(baseStyle, fontName='song', fontSize=11,
                             leading=11, firstLineIndent=20)
    titleStyle = _makeStyle(baseStyle, fontName='song', fontSize=15,
                            leading=20)
    firstTitleStyle = _makeStyle(baseStyle, fontName='song', fontSize=20,
                                 leading=20, firstLineIndent=50)
    smallStyle = _makeStyle(baseStyle, fontName='song', fontSize=8,
                            leading=8)

    # Paragraph parses its text as XML-like markup, so all scraped or
    # user-supplied text is escaped to keep '&', '<' and '>' literal.
    story = []
    story.append(Paragraph(escape("读者{0}期".format(issue)), firstTitleStyle))

    # Table of contents.
    for eachColumn in duzhe:
        story.append(Paragraph('__' * 28, titleStyle))
        story.append(Paragraph(escape('{0}'.format(eachColumn)), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(escape(eachArticle["title"]), normalStyle))
    story.append(flowables.PageBreak())

    # Article bodies, one article per page run.
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(
                escape("{0}".format(eachArticle["title"])), titleStyle))
            story.append(Paragraph(
                escape(" {0} {1}".format(eachArticle["writer"],
                                         eachArticle["from"])),
                smallStyle))
            # NOTE(review): the separator is a single space as it appears in
            # the published listing; confirm it is the intended delimiter.
            for eachPara in eachArticle["context"].split(" "):
                story.append(Paragraph(escape(eachPara), normalStyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe" + issue + ".pdf")
    print("Writing PDF…")
    doc.build(story)
def main(issue):
    """Crawl the given issue and write it out as a PDF.

    Args:
        issue: Issue identifier string, passed to crawler.getCatalog and
            to writePDF.
    """
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == '__main__':
    # Strip surrounding whitespace so it cannot leak into the crawl URL or
    # the output file name built from the issue string.
    issue = raw_input("Enter issue(201501):").strip()
    main(issue)
以上就是本文的全部内容了,希望大家能够喜欢。
上一篇:Python计算回文数的方法