python读取html中指定元素生成excel文件示例_程序人生

python读取html中指定元素生成excel文件示例

admin

2023-07-31 02:06:59

0次

使用 Python 2.7 编写的脚本：读取 HTML 中的指定元素，并生成 Excel 文件。

代码如下:
#coding=gbk
import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup
from xlrd import open_workbook

class LogMsg:
    """Small wrapper that writes timestamped messages to a log file.

    A ``logging.FileHandler`` for the given file is attached to the root
    logger, and every message is emitted at the logger's own effective
    level, so a message passed to :meth:`output` is never filtered out by
    the level chosen at construction time.
    """

    # Numeric levels understood by __init__/output; they are the standard
    # logging constants DEBUG, INFO, WARNING, ERROR and CRITICAL.
    _KNOWN_LEVELS = (10, 20, 30, 40, 50)

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* to the root logger.

        Args:
            logfile: Path of the log file to open.
            Level: One of 10, 20, 30, 40 or 50. Any other value selects
                ``logging.NOTSET``.

        On any setup failure a short message is printed and the process
        exits with status 1.
        """
        try:
            # Imported here because the surrounding file does not import
            # logging at module level.
            import logging
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if Level in self._KNOWN_LEVELS:
                self.logger.setLevel(Level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit are not swallowed and relabelled as a log error.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Log *logInfo* at the logger's current effective level.

        Falls back to INFO when the effective level is not one of the five
        standard levels. On failure a message is printed and the process
        exits with status 1.
        """
        level = self.logger.getEffectiveLevel()
        try:
            if level in self._KNOWN_LEVELS:
                self.logger.log(level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file.

        On failure a message is printed and the process exits with
        status 1.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the file descriptor open.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)

# Sample the clock once so the log-file date and the workbook timestamp
# cannot disagree when the script starts right at midnight.
_started = time.localtime()
Logtime = time.strftime("%Y%m%d%H%M%S", _started)
logFileTime = time.strftime("%Y%m%d", _started)
# One log file per day; the logs directory must already exist.
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, 20)

# Directory scanned for *.html input; the generated workbook is saved here too.
DATAPATH = '/data/pyExample/'
XLSname = 'dangjian_' + Logtime + '.xls'

# Header row of the generated sheet, in column order.
_HEADERS = (
    '内容类型 ',
    '栏目名称',
    '栏目编号',
    '内容名称',
    '时长',
    '关键字',
    '看点',
    '作者',
    '来源',
    '子内容1',
    '子内容2',
)


def _read_td_cells(path):
    """Return the GBK-encoded text of every <td> in the HTML file at *path*.

    The file is decoded as GBK and parsed one line at a time.
    """
    cells = []
    # The with-block closes the file even if reading raises.
    with codecs.open(path, 'r', 'gbk') as html_file:
        lines = html_file.readlines()
    if not lines:
        log.output('not line')
    for line in lines:
        # strip() removes the newline, so comparing its result with '\n'
        # can never match; test for an empty result instead.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): this removes every space character from the line;
        # the literal may originally have been a non-breaking space or
        # '&nbsp;' before the page was scraped -- confirm against the
        # original script.
        line = line.replace(' ', '')
        # Pin the parser so results do not depend on which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(line, 'html.parser')
        for td in soup.findAll('td'):
            cells.append(td.text.encode("gbk"))
    return cells


def _build_row(content):
    """Map the td cells of one page onto the eleven sheet columns.

    Raises IndexError when the page has fewer cells than expected and
    ValueError when the duration cell contains no digits.
    """
    # NOTE(review): the cell positions below presumably match the layout
    # of the input pages; folder name and keyword both read cell 6 --
    # confirm that this is intended.
    folder_name = content[6]
    content_name = content[4]
    digits = ''.join(ch for ch in content[16] if ch.isdigit())
    # The digits found in the duration cell are multiplied by 60.
    str_duration = "%i" % (int(digits) * 60)
    key_word = content[6]
    description = content[36]
    video_name_1 = content[10]
    return ('', folder_name, '', content_name, str_duration, key_word,
            description, '管理员', '华数编辑', video_name_1, '')


def main():
    """Write one sheet row per *.html file in DATAPATH and save the workbook."""
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    for col, title in enumerate(_HEADERS):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件：' + f)
        content = _read_td_cells(DATAPATH + f)

        # enumerate gives the real position of each cell; list.index would
        # report the first match for duplicated cell texts.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('—————————————-')

        try:
            row = _build_row(content)
        except (IndexError, ValueError) as err:
            # One malformed page must not abort the run and lose the rows
            # already collected.
            log.output('skip %s: %s' % (f, err))
            continue

        folderName, contentName, str_duration = row[1], row[3], row[4]
        keyWord, desciption, videoName_1 = row[5], row[6], row[9]
        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(desciption)
        print(videoName_1)
        log.output('输出xls数据：'+','+folderName+',,'+contentName+','+str_duration+','+keyWord+','+desciption+',管理员,华数编辑,'+videoName_1+',,')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    # Saving inside main() keeps the module importable: the workbook only
    # exists when the script is actually run.
    wbk.save(DATAPATH + XLSname)
    print('=========================================')


if __name__ == '__main__':
    main()

生成excel 读取html

上一篇：Python collections模块实例讲解

下一篇：python生成随机验证码(中文验证码)示例

python读取html中指定元素生成excel文件示例

相关内容

热门资讯