使用python解析xml成对应的html示例分享
admin
2023-07-31 02:07:11
0

本文示例使用 SAX 将 ddt.xml 解析成对应的 HTML。当然,如果能拿到该 XML 对应的 XSL 文件,也可以直接用 libxml2 将其转换成 HTML。

复制代码 代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#—————————————
#   程序:XML解析器
#   版本:01.0
#   作者:mupeng
#   日期:2013-12-18
#   语言:Python 2.7
#   功能:将xml解析成对应的html
#   注解:该程序用xml.sax模块的parse函数解析XML,并生成事件
#   继承ContentHandler并重写其事件处理函数
#   Dispatcher主要用于相应标签的起始、结束事件的派发
#—————————————
from xml.sax.handler import ContentHandler
from xml.sax import parse

class Dispatcher:
    """Mixin that routes SAX element events to per-tag handler methods.

    For an element named ``item`` the start event is sent to ``startItem``
    and the end event to ``endItem``. When no tag-specific handler exists
    the event goes to ``defaultStart`` / ``defaultEnd`` if the subclass
    defines them; otherwise it is ignored.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Invoke the handler for an element event.

        Args:
            prefix: Event kind; 'start' or 'end' as used by this class.
            name: Element name as reported by the SAX parser.
            attrs: Element attributes. Accepted for interface
                compatibility but not forwarded: handlers are always
                called with no arguments.
        """
        # str.capitalize() lower-cases everything after the first
        # character, so 'pubDate' maps to 'startPubdate' / 'endPubdate'.
        handler = getattr(self, prefix + name.capitalize(), None)
        if not callable(handler):
            # No tag-specific handler: fall back to the generic one.
            handler = getattr(self, 'default' + prefix.capitalize(), None)
        if callable(handler):
            handler()

    def startElement(self, name, attrs):
        """SAX callback: dispatch the opening of element ``name``."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: dispatch the closing of element ``name``."""
        self.dispatch('end', name)

class Website(Dispatcher, ContentHandler):

    def __init__(self):
        self.fout = open(\’ddt_SAX.html\’, \’w\’)
        self.imagein = False
        self.desflag = False
        self.item = False
        self.title = \’\’
        self.link = \’\’
        self.guid = \’\’
        self.url = \’\’
        self.pubdate = \’\’
        self.description = \’\’
        self.temp = \’\’
        self.prx = \’\’
    def startChannel(self):

        self.fout.write(\’\’\'\\n\\n RSS-\’\’\’)</p> <p>    def endChannel(self):<br />       self.fout.write(\’\’\’<br />                    <tr><td height=\”20\”></td></tr><br />                    </table><br />                    </center><br />                    <script><br />    function  GetTimeDiff(str)<br />    {<br />     if(str == \’\’)<br />     {<br />      return \’\’;<br />     }</p> <p>     var pubDate = new Date(str);<br />     var nowDate = new Date();<br />     var diffMilSeconds = nowDate.valueOf()-pubDate.valueOf();<br />     var days = diffMilSeconds/86400000;<br />     days = parseInt(days);</p> <p>     diffMilSeconds = diffMilSeconds-(days*86400000);<br />     var hours = diffMilSeconds/3600000;<br />     hours = parseInt(hours);</p> <p>     diffMilSeconds = diffMilSeconds-(hours*3600000);<br />     var minutes = diffMilSeconds/60000;<br />     minutes = parseInt(minutes);</p> <p>     diffMilSeconds = diffMilSeconds-(minutes*60000);<br />     var seconds = diffMilSeconds/1000;<br />     seconds = parseInt(seconds);</p> <p>     var returnStr = \”±±¾©·¢²¼Ê±¼ä£º\” + pubDate.toLocaleString();</p> <p>     if(days > 0)<br />     {<br />      returnStr = returnStr + \” £¨¾àÀëÏÖÔÚ\” + days + \”Ìì\” + hours + \”Сʱ\” + minutes + \”·ÖÖÓ£©\”;<br />     }<br />     else if (hours > 0)<br />     {<br />      returnStr = returnStr + \” £¨¾àÀëÏÖÔÚ\” + hours + \”Сʱ\” + minutes + \”·ÖÖÓ£©\”;<br />     }<br />     else if (minutes > 0)<br />     {<br />      returnStr = returnStr + \” £¨¾àÀëÏÖÔÚ\” + minutes + \”·ÖÖÓ£©\”;<br />     }</p> <p>     return returnStr;</p> <p>    }</p> <p>    function GetSpanText()<br />    {<br />     var pubDate;<br />     var pubDateArray;<br />     var spanArray = document.getElementsByTagName(\”span\”);</p> <p>     for(var i = 0; i < spanArray.length; i++)<br />     {<br />      pubDate = spanArray[i].innerHTML;<br />      document.getElementsByTagName(\”span\”)[i].innerHTML = GetTimeDiff(pubDate);   <br />     }<br />    }</p> 
<p>    GetSpanText();<br />   </script><br />                </body><br />                </html><br />                \’\’\’)<br />       self.fout.close()</p> <p>    def characters(self, chars):<br />        if chars.strip():<br />            #chars = chars.strip()<br />            self.temp += chars<br />            #print self.temp</p> <p>       <br />    def startTitle(self):</p> <p>        if self.item:<br />            self.fout.write(\’\’\’<br />                        <tr bgcolor=\”#eeeeee\”>\\n<td style=\”padding-top:5px;padding-left:5px;\” height=\”30\”>\\n<B><br />                    \’\’\’)</p> <p>    def endTitle(self):</p> <p>        if not self.imagein and not self.item:<br />            self.title = self.temp<br />            self.temp = \’\’<br />            self.fout.write(self.title.encode(\’gb2312\’))</p> <p>            #self.title = self.temp<br />            self.fout.write(\’\’\’<br />                \\n\\n\\n

\\n
                \\n
                \\n
                \\n
               
                       
                       
                       
                           
                           
                       
                       
\\n
            \’\’\’)

        if self.item:
            self.title = self.temp
            self.temp = \’\’
            self.fout.write(self.title.encode(\’gb2312\’))
            self.fout.write(\’\’\’
                       
                       


                        \’\’\’)

    def startImage(self):
        """Mark that the parser has entered an <image> element."""
        self.imagein = True

    def endImage(self):
        """Mark that the parser has left the <image> element."""
        self.imagein = False

    def startLink(self):
        if self.imagein:
            self.fout.write(\’\’\'

           
    def endLink(self):
        self.link = self.temp
        self.temp = \’\’
        if self.imagein:
            self.fout.write(self.link.encode(\’gb2312\’))
            self.fout.write(\’\’\’\” target=\”_blank\”>\\n \’\’\’)
        elif self.item:
            #self.link = self.temp
            pass
        else:
            self.fout.write(self.link)
            self.fout.write(\’\’\’ \” target=\”
      _blank
     \”> \’\’\’)
            self.fout.write(self.title.encode(\’gb2312\’))
            self.fout.write(\’\’\’


                            \’\’\’)
            self.fout.write(self.description.encode(\’gb2312\’))
            self.fout.write(\’\’\’
                       
¸´ÖÆ´ËÒ³Á´½Ó                ÎÒҪǶÈë¸ÃÐÂÎÅÁÐ±íµ½ÎÒµÄÒ³Ãæ£¨¼òµ¥¡¢¿ìËÙ¡¢ÊµÊ±¡¢Ãâ·Ñ£©

                       
                            \’\’\’)

    def startUrl(self):
        if self.imagein:
            self.fout.write(\’\’\'    def endUrl(self):
        self.url = self.temp
        self.temp = \’\’
        if self.imagein:
            self.fout.write(self.url.encode(\’gb2312\’))
            self.fout.write(\’\’\’\” border=\”0\”>\\n
                           
                           
                           


                           
                           
                           
                       
                       
                       
                       
                        \’\’\’)

#程序入口
if __name__ == \’__main__\’:
    parse(\’ddt.xml\’, Website())

相关内容

热门资讯

Mobi、epub格式电子书如... 在wps里全局设置里有一个文件关联,打开,勾选电子书文件选项就可以了。
定时清理删除C:\Progra... C:\Program Files (x86)下面很多scoped_dir开头的文件夹 写个批处理 定...
scoped_dir32_70... 一台虚拟机C盘总是莫名奇妙的空间用完,导致很多软件没法再运行。经过仔细检查发现是C:\Program...
500 行 Python 代码... 语法分析器描述了一个句子的语法结构,用来帮助其他的应用进行推理。自然语言引入了很多意外的歧义,以我们...
小程序支付时提示:appid和... [Q]小程序支付时提示:appid和mch_id不匹配 [A]小程序和微信支付没有进行关联,访问“小...
65536是2的几次方 计算2... 65536是2的16次方:65536=2¹⁶ 65536是256的2次方:65536=256² 6553...
pycparser 是一个用... `pycparser` 是一个用 Python 编写的 C 语言解析器。它可以用来解析 C 代码并构...
微信小程序使用slider实现... 众所周知哈,微信小程序里面的音频播放是没有进度条的,但最近有个项目呢,客户要求音频要有进度条控制,所...
Apache Doris 2.... 亲爱的社区小伙伴们,我们很高兴地向大家宣布,Apache Doris 2.0.0 版本已于...
python清除字符串里非数字... 本文实例讲述了python清除字符串里非数字字符的方法。分享给大家供大家参考。具体如下: impor...
                            \’\’\’)
        if self.item:
            #self.url = self.temp
            pass

    def defaultStart(self):
        """Fallback start handler for elements without a dedicated one; does nothing."""
        pass
    def defaultEnd(self):
        self.temp = \’\’
    def startDescription(self):
        """Start handler for <description>; nothing to do until the element closes."""
        pass
    def endDescription(self):
        self.description = self.temp
        self.temp = \’\’
        if self.item:
            #self.fout.write(\’¡¡¡¡\’)
            self.fout.write(self.description.encode(\’gb2312\’))

    def endGuid(self):
        """Store the buffered text as the current guid.

        NOTE(review): unlike the other end handlers, the buffer is not
        cleared here - confirm whether that is intentional.
        """
        self.guid = self.temp
    def endPubdate(self):
        if not self.temp.startswith(\’http\’):
         self.pubdate = self.temp
         self.temp = \’\’
        else:
            self.pubdate = \’\’
    def startItem(self):
        """Mark that the parser has entered an <item> element."""
        self.item = True
    def endItem(self):
        self.item = False
        self.fout.write(\’\’\’
                           


                                    self.fout.write(self.link)
        self.fout.write(\’\’\’ \” target=\”_blank\”> \’\’\’)
        self.fout.write(self.guid)
        self.fout.write(\’\’\’
                       

                       
\’\’\’)
        self.fout.write(self.pubdate)
        self.fout.write(\’\’\'