Python读取HTML中指定元素生成Excel文件示例
admin
2023-07-31 02:06:59
0

使用Python 2.7编写的示例:读取HTML中的指定元素,并生成Excel文件

复制代码 代码如下:
#coding=gbk
import string
import codecs
import os,time
import xlwt
import xlrd
from bs4 import BeautifulSoup
from xlrd import open_workbook

class LogMsg:
    """Small file-logging wrapper around the root ``logging`` logger.

    Creating an instance attaches a ``FileHandler`` for ``logfile`` to the
    root logger and sets the root logger's level, so it affects every other
    user of the root logger in the process.

    Any failure in one of the methods prints a short message and terminates
    the process with exit status 1.
    """

    def __init__(self, logfile, Level=0):
        """Attach a file handler for ``logfile`` and set the logger level.

        :param logfile: path of the log file to append to.
        :param Level: numeric logging level (10, 20, 30, 40 or 50); any
            other value selects ``logging.NOTSET``.
        """
        try:
            import logging
            level_by_code = {
                10: logging.DEBUG,
                20: logging.INFO,
                30: logging.WARNING,
                40: logging.ERROR,
                50: logging.CRITICAL,
            }
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter("[%(asctime)s]: %(message)s",
                                          "%Y%m%d %H:%M:%S")
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            self.logger.setLevel(level_by_code.get(Level, logging.NOTSET))
        except Exception:
            # "except Exception" (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate untouched.
            print("log init error!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Write ``logInfo`` at the logger's current effective level.

        Logging at the effective level means the record always passes the
        logger's own level check. If the effective level is not one of the
        five standard levels, the message is logged at INFO.
        """
        Level = self.logger.getEffectiveLevel()
        try:
            if Level in (10, 20, 30, 40, 50):
                self.logger.log(Level, logInfo)
            else:
                self.logger.info(logInfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close the log file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the file open; close it so
            # the descriptor is released.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)

# Take a single snapshot of the local time so that the run timestamp and the
# log-file date cannot disagree when the script starts right at midnight.
_startTime = time.localtime()
Logtime = time.strftime("%Y%m%d%H%M%S", _startTime)   # used in the .xls file name
logFileTime = time.strftime("%Y%m%d", _startTime)     # used in the log file name
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
# Level 20 makes LogMsg set the logger to logging.INFO.
log = LogMsg(Logfile, 20)

# Directory that is scanned for *.html input and that receives the workbook.
DATAPATH = '/data/pyExample/'
XLSname = 'dangjian_' + Logtime + '.xls'

if __name__ == '__main__':
    # Script entry point (written for Python 2.7).
    #
    # For every *.html file in DATAPATH: collect the text of each <td> cell
    # (GBK-encoded), pick a fixed set of cells by position, and write them as
    # one row of a new .xls workbook that is saved under DATAPATH.
    #
    # print() is always called with exactly one argument, so each call
    # prints the same text under the Python 2 print statement as well.

    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')

    # Header row (row 0). The trailing space in the first title is kept
    # exactly as in the original template.
    headers = ['内容类型 ', '栏目名称', '栏目编号', '内容名称', '时长', '关键字',
               '看点', '作者', '来源', '子内容1', '子内容2']
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    # Positions, within the flat list of <td> texts of one file, of the cells
    # that feed the spreadsheet row.
    IDX_CONTENT_NAME = 4
    IDX_FOLDER_NAME = 6
    IDX_VIDEO_NAME = 10
    IDX_DURATION = 16
    IDX_DESCRIPTION = 36
    MIN_CELLS = IDX_DESCRIPTION + 1

    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue

        content = []
        log.output('当前文件:' + f)
        # "with" closes the file even if decoding raises.
        with codecs.open(os.path.join(DATAPATH, f), 'r', 'gbk') as htmlFile:
            lines = htmlFile.readlines()
        if not lines:
            log.output('not line')
        for line in lines:
            # strip() removes the newline as well, so a blank line is an
            # empty string after stripping (comparing it to '\n' never
            # matched).
            if not line.strip():
                log.output('该处是空行')
                continue
            # NOTE(review): this removes every ASCII space from the raw line
            # before parsing, which also fuses a tag name with its attributes
            # (e.g. "<td class=...>"). The published listing may have lost an
            # "&nbsp;" entity here - confirm against real input before
            # changing it.
            line = line.replace(' ', '')
            soup = BeautifulSoup(line)
            for tdd in soup.findAll('td'):
                content.append(tdd.text.encode("gbk"))

        # Dump every cell with its real position; enumerate() stays correct
        # when two cells hold the same text and avoids a rescan per cell.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('—————————————-')

        # A file with fewer cells than expected is skipped instead of
        # aborting the run (and losing the rows collected so far).
        if len(content) < MIN_CELLS:
            log.output('skip %s: found %d td cells, need at least %d'
                       % (f, len(content), MIN_CELLS))
            continue

        folderName = content[IDX_FOLDER_NAME]
        contentName = content[IDX_CONTENT_NAME]
        # Keep only the digits of the duration cell; the value is multiplied
        # by 60 before it is written.
        digits = ''.join(ch for ch in content[IDX_DURATION] if ch.isdigit())
        if digits:
            int_duration = int(digits) * 60
        else:
            log.output('no digits in duration cell of %s, using 0' % f)
            int_duration = 0
        str_duration = "%i" % int_duration
        keyWord = content[IDX_FOLDER_NAME]  # same cell as folderName
        description = content[IDX_DESCRIPTION]
        videoName_1 = content[IDX_VIDEO_NAME]

        print(folderName)
        print(contentName)
        print(str_duration)
        print(keyWord)
        print(description)
        print(videoName_1)
        log.output('输出xls数据:' + ',' + folderName + ',,' + contentName + ','
                   + str_duration + ',' + keyWord + ',' + description
                   + ',管理员,华数编辑,' + videoName_1 + ',,')
        print(k)

        # Data rows start at row 1, directly below the header row.
        row = ['', folderName, '', contentName, str_duration, keyWord,
               description, '管理员', '华数编辑', videoName_1, '']
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    wbk.save(DATAPATH + XLSname)

    print('=========================================')

相关内容

热门资讯

500 行 Python 代码... 语法分析器描述了一个句子的语法结构,用来帮助其他的应用进行推理。自然语言引入了很多意外的歧义,以我们...
Mobi、epub格式电子书如... 在wps里全局设置里有一个文件关联,打开,勾选电子书文件选项就可以了。
定时清理删除C:\Progra... C:\Program Files (x86)下面很多scoped_dir开头的文件夹 写个批处理 定...
scoped_dir32_70... 一台虚拟机C盘总是莫名其妙地空间用完,导致很多软件没法再运行。经过仔细检查发现是C:\Program...
65536是2的几次方 计算2... 65536是2的16次方:65536=2¹⁶ 65536是256的2次方:65536=256² 6553...
小程序支付时提示:appid和... [Q]小程序支付时提示:appid和mch_id不匹配 [A]小程序和微信支付没有进行关联,访问“小...
pycparser 是一个用... `pycparser` 是一个用 Python 编写的 C 语言解析器。它可以用来解析 C 代码并构...
微信小程序使用slider实现... 众所周知哈,微信小程序里面的音频播放是没有进度条的,但最近有个项目呢,客户要求音频要有进度条控制,所...
Apache Doris 2.... 亲爱的社区小伙伴们,我们很高兴地向大家宣布,Apache Doris 2.0.0 版本已于...
python清除字符串里非数字... 本文实例讲述了python清除字符串里非数字字符的方法。分享给大家供大家参考。具体如下: impor...