使用 Python 2.7 编写的脚本:读取 HTML 文件中的指定元素,并生成 Excel 文件
复制代码 代码如下:
#coding=gbk
import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup
from xlrd import open_workbook
class LogMsg:
    """Minimal file logger built on the root ``logging`` logger.

    Each instance attaches one ``FileHandler`` (writing to ``logfile``) to
    the root logger and sets the root logger's level.  Any failure while
    initialising, writing or closing prints a short message and terminates
    the process with exit status 1.
    """

    # Numeric values of logging.DEBUG/INFO/WARNING/ERROR/CRITICAL.  Literals
    # are used because ``logging`` is only imported inside ``__init__``.
    _KNOWN_LEVELS = (10, 20, 30, 40, 50)
    _INFO = 20

    def __init__(self, logfile, Level=0):
        """Attach a file handler for ``logfile`` and apply ``Level``.

        ``Level`` must be one of 10, 20, 30, 40 or 50; any other value sets
        the logger to ``logging.NOTSET``.
        """
        try:
            import logging
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if Level in self._KNOWN_LEVELS:
                self.logger.setLevel(Level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # ``Exception`` rather than a bare except so that Ctrl-C and
            # SystemExit are not swallowed and re-labelled as a log error.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Write ``logInfo`` at the logger's effective level.

        If the effective level is not one of the five standard levels the
        message is written at INFO.
        """
        try:
            level = self.logger.getEffectiveLevel()
            if level not in self._KNOWN_LEVELS:
                level = self._INFO
            self.logger.log(level, logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach this instance's handler from the logger and close its file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the log file open.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)
# Sample the clock once so the log-file name and the workbook name are derived
# from the same instant (two separate calls could straddle a date change).
_started = time.localtime()
Logtime = time.strftime("%Y%m%d%H%M%S", _started)
logFileTime = time.strftime("%Y%m%d", _started)
# The log file name carries only the date, so runs on the same day share it.
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, 20)  # 20 is the INFO branch of LogMsg
# Directory scanned for .html input; the generated workbook is saved here too.
DATAPATH = '/data/pyExample/'
XLSname = 'dangjian_' + Logtime + '.xls'
# Row-0 column titles of the import template, in column order.
_XLS_HEADERS = [
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
]

# Highest <td> index read from a page (the description cell, content[36]).
_LAST_CELL_INDEX = 36


def _duration_to_seconds(text):
    """Keep only the ASCII digits of ``text``, read them as minutes, return seconds as a string.

    Returns ``None`` when ``text`` contains no digit at all.
    """
    digits = ''.join(ch for ch in text if ch in string.digits)
    if not digits:
        return None
    return '%i' % (int(digits) * 60)


def _read_cells(path):
    """Return the gbk-encoded text of every <td> in ``path``, parsed line by line."""
    # ``with`` guarantees the file is closed even if decoding fails part-way.
    with codecs.open(path, 'r', 'gbk') as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output('not line')
    cells = []
    for line in lines:
        # strip() removes the newline, so a blank line is an empty string
        # (comparing the stripped line with '\n' could never match).
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): removing every space also removes the spaces inside
        # tags (between a tag name and its attributes).  The literal may have
        # been '&nbsp;' before this listing was published -- confirm.
        line = line.replace(' ', '')
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(line, 'html.parser')
        for td in soup.findAll('td'):
            cells.append(td.text.encode('gbk'))
    return cells


def main():
    """Convert every .html file in DATAPATH into one row of a new .xls workbook.

    Each page's <td> cells are printed and logged with their index; selected
    cells are then written as one row below the header.  Pages that do not
    contain enough cells, or whose duration cell has no digits, are logged
    and skipped so that one malformed page does not abort the whole batch.
    """
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_XLS_HEADERS):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:' + f)
        content = _read_cells(os.path.join(DATAPATH, f))

        # enumerate() gives the true position even when two cells hold the
        # same text (list.index would report the first match) and is O(n).
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('—————————————-')

        if len(content) <= _LAST_CELL_INDEX:
            log.output('skipped %s: need more than %d <td> cells, found %d'
                       % (f, _LAST_CELL_INDEX, len(content)))
            continue

        # NOTE(review): folder name and keyword both read cell 6 -- confirm
        # that this is intended.
        folderName = content[6]
        contentName = content[4]
        str_duration = _duration_to_seconds(content[16])
        if str_duration is None:
            log.output('skipped %s: no digits in duration cell' % f)
            continue
        keyWord = content[6]
        desciption = content[36]
        videoName_1 = content[10]

        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(desciption)
        print(videoName_1)
        log.output('输出xls数据:' + ',' + folderName + ',,' + contentName + ','
                   + str_duration + ',' + keyWord + ',' + desciption
                   + ',管理员,华数编辑,' + videoName_1 + ',,')
        print(k)

        row = [
            '',
            folderName,
            '',
            contentName,
            str_duration,
            keyWord,
            desciption,
            '管理员',
            '华数编辑',
            videoName_1,
            '',
        ]
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(os.path.join(DATAPATH, XLSname))
    print('=========================================')


if __name__ == '__main__':
    main()