Writing a crawler is complex, tedious, and repetitive work; the issues to consider include collection efficiency, handling of broken links and request failures, and data quality (which depends heavily on how consistently the target site follows its own markup conventions). Below is a spider program I wrote and cleaned up; a single server can run 1 to 8 instances collecting in parallel, and the collected data is then written into the database.
#!/usr/local/bin/python
# -*- coding:utf-8 -*-
import sys, time, os, string
import mechanize
import urlparse
from BeautifulSoup import BeautifulSoup
import re
import MySQLdb
import logging
import cgi
from optparse import OptionParser
#----------------------------------------------------------------------------#
# Name:      TySpider.py                                                      #
# Purpose:   WebSite Spider Module                                            #
# Author:    刘天斯                                                           #
# Email:     liutiansi@gamil.com                                              #
# Created:   2010/02/16                                                       #
# Copyright: (c) 2010                                                         #
#----------------------------------------------------------------------------#
\"\"\"
|--------------------------------------------------------------------------
| 定义 loging class;
|--------------------------------------------------------------------------
|
| 功能:记录系统相关日志信息。
|
|
\"\"\"
class Pubclilog():
    def __init__(self):
        self.logfile = 'website_log.txt'

    def iniLog(self):
        logger = logging.getLogger()
        filehandler = logging.FileHandler(self.logfile)
        streamhandler = logging.StreamHandler()
        fmt = logging.Formatter('%(asctime)s, %(funcName)s, %(message)s')
        filehandler.setFormatter(fmt)
        streamhandler.setFormatter(fmt)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)
        return [logger, filehandler]
\"\"\"
|--------------------------------------------------------------------------
| 定义 tySpider class;
|--------------------------------------------------------------------------
|
| 功能:抓取分类、标题等信息
|
|
\"\"\"
class BaseTySpider:
    # Initialize member attributes
    def __init__(self, X, log_switch):
        # Database connection
        self.conn = MySQLdb.connect(db='dbname', host='192.168.0.10', user='dbuser', passwd='SDFlkj934y5jsdgfjh435', charset='utf8')
        # Category and title page of the Community
        self.CLASS_URL = 'http://test.abc.com/aa/CommTopicsPage?'
        # Post/reply page
        self.Content_URL = 'http://test.bac.com/aa/CommMsgsPage?'
        # Starting comm value
        self.X = X
        # Modulo of the current comm id, used to spread rows evenly across tables
        self.mod = self.X % 5
        # Downloaded body of the Community page
        self.body = ""
        # BeautifulSoup object built from self.body
        self.soup = None
        # Downloaded body of the post/reply page
        self.Contentbody = ""
        # BeautifulSoup object built from the post/reply page body
        self.Contentsoup = None
        # Logging switch
        self.log_switch = log_switch
    #====================== Method: fetch name and category ==========================
    def _SpiderClass(self, nextpage=None):
        if nextpage == None:
            FIXED_QUERY = 'cmm=' + str(self.X)
        else:
            FIXED_QUERY = nextpage[1:]
        try:
            rd = mechanize.Browser()
            rd.addheaders = [("User-agent", "Tianya/2010 (compatible; MSIE 6.0;Windows NT 5.1)")]
            rd.open(self.CLASS_URL + FIXED_QUERY)
            self.body = rd.response().read()
            #rd = mechanize.Request(self.CLASS_URL + FIXED_QUERY)
            #response = mechanize.urlopen(rd)
            #self.body = response.read()
        except Exception, e:
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info(self.CLASS_URL + FIXED_QUERY + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
            return
        self.soup = BeautifulSoup(self.body)
        NextPageObj = self.soup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        self.cursor = self.conn.cursor()
        if nextpage == None:
            try:
                Ttag = str(self.soup.table)
                #print Ttag
                """
                ------------------ Parse the structure -----------------
                """
                soupTable = BeautifulSoup(Ttag)
                # Locate the first h1 tag
                tableh1 = soupTable("h1")
                #print self.X
                #print "Name:" + tableh1[0].string.strip().encode('utf-8')
                # Handle entries that have no category
                try:
                    # Locate the <a> links in the table whose href matches "^TopByCategory";
                    # tablea[0] is the text of the first matching link, tablea[1] the second, ...
                    tablea = soupTable("a", {'href': re.compile("^TopByCategory")})
                    if tablea[0].string.strip() == "":
                        pass
                    #print "BigCLass:" + tablea[0].string.strip().encode('utf-8')
                    #print "SubClass:" + tablea[1].string.strip().encode('utf-8')
                except Exception, e:
                    if self.log_switch == "on":
                        logapp = Pubclilog()
                        logger, hdlr = logapp.iniLog()
                        logger.info("[noClassInfo]" + str(self.X) + str(e))
                        hdlr.flush()
                        logger.removeHandler(hdlr)
                    self.cursor.execute("insert into baname" + str(self.mod) + " values('%d','%d','%s')" % (self.X, -1, tableh1[0].string.strip().encode('utf-8')))
                    self.conn.commit()
                    self._SpiderTitle()
                    if NextPageObj:
                        NextPageURL = NextPageObj[0]['href']
                        self._SpiderClass(NextPageURL)
                        return
                    else:
                        return
                # Get the href value of the second link object
                classlink = tablea[1]['href']
                par_dict = cgi.parse_qs(urlparse.urlparse(classlink).query)
                #print "CID:" + par_dict["cid"][0]
                #print "SubCID:" + par_dict["subcid"][0]
                #print "---------------------------------------"
                # Insert into the database
                self.cursor.execute("insert into class values('%d','%s')" % (int(par_dict["cid"][0]), tablea[0].string.strip().encode('utf-8')))
                self.cursor.execute("insert into subclass values('%d','%d','%s')" % (int(par_dict["subcid"][0]), int(par_dict["cid"][0]), tablea[1].string.strip().encode('utf-8')))
                self.cursor.execute("insert into baname" + str(self.mod) + " values('%d','%d','%s')" % (self.X, int(par_dict["subcid"][0]), tableh1[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._SpiderTitle()
                if NextPageObj:
                    NextPageURL = NextPageObj[0]['href']
                    self._SpiderClass(NextPageURL)
                self.body = None
                self.soup = None
                Ttag = None
                soupTable = None
                table = None
                table1 = None
                classlink = None
                par_dict = None
            except Exception, e:
                if self.log_switch == "on":
                    logapp = Pubclilog()
                    logger, hdlr = logapp.iniLog()
                    logger.info("[ClassInfo]" + str(self.X) + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)
        else:
            self._SpiderTitle()
            if NextPageObj:
                NextPageURL = NextPageObj[0]['href']
                self._SpiderClass(NextPageURL)
    #==================== Method: fetch titles =========================
    def _SpiderTitle(self):
        # Locate the title table object (table)
        soupTitleTable = self.soup("table", {'class': "fs-topic-list"})
        # Locate the title row objects (tr)
        TitleTr = soupTitleTable[0]("tr", {'onmouseover': re.compile("^this\.className='fs-row-hover'")})
        """
        ----------- Structure being parsed (one row, HTML tags stripped in this excerpt) --------------
        Each row carries: topic title (e.g. 【新人报到】欢迎美国人民加入),
        reply count / view count (e.g. 0 / 12), author (e.g. 中国人) and last-reply time (e.g. 2-14).
        """
        for CurrTr in TitleTr:
            try:
                # Initialize the sticky and starred (digest) flags
                Title_starred = 'N'
                Title_sticky = 'N'
                # Get a BeautifulSoup object for the current row
                soupCurrTr = BeautifulSoup(str(CurrTr))
                # BeautifulSoup mis-parses this HTML, so the post status can only be inferred
                # from the number of span tags, which introduces some error:
                # e.g. a post that is only a digest will also be treated as sticky.
                TitleStatus = soupCurrTr("span", {'title': ""})
                TitlePhotoViewer = soupCurrTr("a", {'href': re.compile("^PhotoViewer")})
                if TitlePhotoViewer.__len__() == 1:
                    TitlePhotoViewerBool = 0
                else:
                    TitlePhotoViewerBool = 1
                if TitleStatus.__len__() == 3 - TitlePhotoViewerBool:
                    Title_starred = 'Y'
                    Title_sticky = 'Y'
                elif TitleStatus.__len__() == 2 - TitlePhotoViewerBool:
                    Title_sticky = 'Y'
                # Get the post title
                Title = soupCurrTr.a.next.strip()
                # Get the post ID
                par_dict = cgi.parse_qs(urlparse.urlparse(soupCurrTr.a['href']).query)
                # Get the reply count and view count
                TitleNum = soupCurrTr("td", {'class': "fs-topic-name"})
                TitleArray = string.split(str(TitleNum[0]), '\n')
                Title_ReplyNum = string.split(TitleArray[len(TitleArray) - 4], '>')[2]
                Title_ViewNum = string.split(TitleArray[len(TitleArray) - 2], '>')[2][:-6]
                # Get the post author
                TitleAuthorObj = soupCurrTr("td", {'style': "padding-left:4px"})
                Title_Author = TitleAuthorObj[0].next.next.next.string.strip().encode('utf-8')
                # Get the last-reply time
                TitleTime = soupCurrTr("td", {'class': re.compile("^fs-topic-last-mdfy fs-meta")})
\"\"\"
print \"X:\"+str(self.X)
print \"Title_starred:\"+Title_starred
print \"Title_sticky:\"+Title_sticky
print \"Title:\"+Title
#获取贴子内容连接URL
print \"Title_link:\"+soupCurrTr.a[\'href\']
print \"CID:\"+par_dict[\"tid\"][0]
print \"Title_ReplyNum:\"+Title_ReplyNum
print \"Title_ViewNum:\"+Title_ViewNum
print \"Title_Author:\"+Title_Author
print \"TitleTime:\"+TitleTime[0].string.strip().encode(\'utf-8\')
\"\"\"
#入库
self.cursor.execute(\"insert into Title\"+str(self.mod)+\" values(\'%s\',\'%d\',\'%s\',\'%d\',\'%d\',\'%s\',\'%s\',\'%s\',\'%s\')\" %(par_dict[\"tid\"][0], \\
self.X,Title,int(Title_ReplyNum),int(Title_ViewNum),Title_starred,Title_sticky, \\
Title_Author.decode(\'utf-8\'),TitleTime[0].string.strip().encode(\'utf-8\')))
self.conn.commit()
self._SpiderContent(par_dict[\"tid\"][0])
except Exception,e:
if self.log_switch==\"on\":
logapp=Pubclilog()
logger,hdlr = logapp.iniLog()
logger.info(\"[Title]\"+str(self.X)+\'-\'+par_dict[\"tid\"][0]+\'-\'+str(e))
hdlr.flush()
logger.removeHandler(hdlr)
    #====================== Method: fetch posts and replies =======================
    def _SpiderContent(self, ID, nextpage=None):
        if nextpage == None:
            FIXED_QUERY = 'cmm=' + str(self.X) + '&tid=' + ID + '&ref=regulartopics'
        else:
            FIXED_QUERY = nextpage[9:]
        rd = mechanize.Browser()
        rd.addheaders = [("User-agent", "Tianya/2010 (compatible; MSIE 6.0;Windows NT 5.1)")]
        rd.open(self.Content_URL + FIXED_QUERY)
        self.Contentbody = rd.response().read()
        #rd = mechanize.Request(self.Content_URL + FIXED_QUERY)
        #response = mechanize.urlopen(rd)
        #self.Contentbody = response.read()
        self.Contentsoup = BeautifulSoup(self.Contentbody)
        NextPageObj = self.Contentsoup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        try:
            Tdiv = self.Contentsoup("div", {'class': "fs-user-action"})
            i = 0
            for Currdiv in Tdiv:
                # The first block is the original post, the rest are replies
                if i == 0:
                    Ctype = 'Y'
                else:
                    Ctype = 'N'
                # Post time
                soupCurrdiv = BeautifulSoup(str(Currdiv))
                PosttimeObj = soupCurrdiv("span", {'class': "fs-meta"})
                Posttime = PosttimeObj[0].next[1:]
                Posttime = Posttime[0:-3]
                # IP address
                IPObj = soupCurrdiv("a", {'href': re.compile("CommMsgAddress")})
                if IPObj:
                    IP = IPObj[0].next.strip()
                else:
                    IP = ''
                # Post/reply content
                ContentObj = soupCurrdiv("div", {'class': "fs-user-action-body"})
                Content = ContentObj[0].renderContents().strip()
                """
                print "ID:" + str(self.X)
                print "ID:" + ID
                print "Ctype:" + Ctype
                print "POSTTIME:" + Posttime
                print "IP:" + IP
                print "Content:" + Content
                """
                self.cursor.execute("insert into Content" + str(self.mod) + " values('%s','%d','%s','%s','%s','%s')" % (ID, self.X, Ctype, Posttime, IP, Content.decode('utf-8')))
                self.conn.commit()
                i += 1
        except Exception, e:
            if self.log_switch == "on":
                logapp = Pubclilog()
                logger, hdlr = logapp.iniLog()
                logger.info("[Content]" + str(self.X) + '-' + ID + '-' + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
        # If the "next page" link exists, keep traversing
        if NextPageObj:
            NextPageURL = NextPageObj[0]['href']
            self._SpiderContent(ID, NextPageURL)

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception, e:
            pass
# Iterate over the comm range
def initapp(StartValue, EndValue, log_switch):
    for x in range(StartValue, EndValue):
        app = BaseTySpider(x, log_switch)
        app._SpiderClass()
        app = None
if __name__ == "__main__":
    # Define command-line options
    MSG_USAGE = "TySpider.py [ -s StartNumber EndNumber ] -l [on|off] [-v][-h]"
    parser = OptionParser(MSG_USAGE)
    parser.add_option("-s", "--set", nargs=2, action="store",
                      dest="comm_value",
                      type="int",
                      default=False,
                      help="Set the range of name ID values to crawl.")
    parser.add_option("-l", "--log", action="store",
                      dest="log_switch",
                      type="string",
                      default="on",
                      help="Error-log switch (on/off).")
    parser.add_option("-v", "--version", action="store_true", dest="verbose",
                      help="Show version information.")
    opts, args = parser.parse_args()
    if opts.comm_value:
        if opts.comm_value[0] > opts.comm_value[1]:
            print "The end value is smaller than the start value?"
            exit()
        if opts.log_switch == "on":
            log_switch = "on"
        else:
            log_switch = "off"
        initapp(opts.comm_value[0], opts.comm_value[1], log_switch)
        exit()
    if opts.verbose:
        print "WebSite Spider V1.0 beta."
        exit()
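
With the options defined above, starting a crawl only requires passing a comm ID range and the log switch on the command line. The ranges below are purely illustrative values; running several instances on one server, as mentioned at the start, simply means launching the script multiple times with non-overlapping ranges, for example:

python TySpider.py -s 1 10000 -l on
python TySpider.py -s 10000 20000 -l on

Note that initapp() uses range(StartValue, EndValue), so the end value itself is not crawled, which makes back-to-back ranges like the two above safe to run in parallel.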