python实现多线程采集的2个代码例子
admin
2023-07-31 02:06:10
0

代码一:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
 
import threading
import Queue
import sys
import urllib2
import re
import MySQLdb
 
#
# Database connection settings (placeholders; replace with real values).
# The stray backslashes that preceded every quote in the published listing
# were a syntax error and have been removed.
#
DB_HOST = '127.0.0.1'
DB_USER = "XXXX"
DB_PASSWD = "XXXXXXXX"
DB_NAME = "xxxx"

#
# Crawler settings
#
THREAD_LIMIT = 3                # number of worker threads started by workerbee()
jobs = Queue.Queue(5)           # bounded queue of pending jobs (at most 5 waiting)
singlelock = threading.Lock()   # lock shared by the main thread and the workers
info = Queue.Queue()            # unbounded queue collecting the workers' results
 
def workerbee(inputlist):
    """Start the worker threads, queue every job and wait for completion.

    Args:
        inputlist: Iterable of jobs; each item is put on ``jobs`` unchanged
            and later consumed by a ``spider`` thread.

    A job that cannot be queued within 5 seconds (the queue stayed full) is
    reported on the console and skipped.
    """
    for x in range(THREAD_LIMIT):
        print('Thead {0} started.'.format(x))
        t = spider()
        t.start()

    for i in inputlist:
        try:
            jobs.put(i, block=True, timeout=5)
        except Queue.Full:
            # Only a genuinely full queue is reported as such; any other
            # error now propagates instead of being mislabelled.
            with singlelock:
                print("The queue is full !")

    # Hold the lock while printing so the message is not interleaved with
    # output from the workers.
    with singlelock:
        print("Waiting for threads to finish.")
    # Blocks until task_done() has been called once for every queued job.
    jobs.join()
 
def getTitle(url,time=10):
    """Download *url* and return the content of its first ``<title>`` element.

    Args:
        url: Address of the page to fetch.
        time: Timeout in seconds handed to ``urllib2.urlopen``.

    Returns:
        The raw, undecoded title bytes, or an empty string when the page
        contains no ``<title>`` element.

    Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the caller.
    """
    response = urllib2.urlopen(url, timeout=time)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    # The published pattern was r'(.*?)': its <title> tags had been stripped,
    # so it matched the empty string at position 0 and the function always
    # returned ''.  The tags are restored; the match ignores case and may
    # span several lines.
    matches = re.findall(br'<title>(.*?)</title>', html, re.I | re.S)
    return matches[0] if matches else b''
 
class spider(threading.Thread):
    """Worker thread that fetches the title for each job taken from ``jobs``."""

    def run(self):
        """Process jobs until the queue has stayed empty for one second.

        Each job is indexed as ``job[0]`` (identifier) and ``job[1]`` (URL);
        the result ``[job[0], title]`` is put on ``info``.
        """
        while True:
            try:
                job = jobs.get(True, 1)
            except Queue.Empty:
                # Nothing arrived within the timeout: let the thread end.
                break
            try:
                # The download runs without holding singlelock.  The original
                # held the lock across the fetch, which serialised the
                # workers and leaked the lock whenever the fetch failed.
                title = getTitle(job[1])
                info.put([job[0], title], block=True, timeout=5)
            except Exception as err:
                # One bad URL must not kill the worker; report and carry on.
                with singlelock:
                    sys.stderr.write(
                        'Failed to fetch {0}: {1}\n'.format(job[1], err))
            finally:
                # Always account for the job, otherwise jobs.join() in the
                # main thread would block forever after a failure.
                jobs.task_done()
 
if __name__ == '__main__':
    # Load up to 10 unprocessed (id, url) rows, crawl them with the worker
    # threads, then print every collected result.
    con = None
    urls = []
    try:
        con = MySQLdb.connect(DB_HOST, DB_USER, DB_PASSWD, DB_NAME)
        cur = con.cursor()
        cur.execute('SELECT id,url FROM `table_name` WHERE `status`=0 LIMIT 10')
        urls.extend([row[0], row[1]] for row in cur.fetchall())
        workerbee(urls)
        # workerbee() returns only after every job is done, so the result
        # queue can be drained without blocking.
        while not info.empty():
            print(info.get())
    finally:
        # Close the connection even when the query or the crawl failed.
        if con is not None:
            con.close()

代码二:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#encoding=utf-8
#Filename:robot.py
 
import threading,Queue,sys,urllib2,re
#
# Crawler settings
#
THREAD_LIMIT = 3                # number of worker threads
jobs = Queue.Queue(5)           # bounded job queue (at most 5 waiting URLs)
singlelock = threading.Lock()   # lock shared by the main thread and the workers

# Pages whose titles are fetched.  The stray backslashes that preceded every
# quote in the published listing were a syntax error and have been removed.
urls = [
  'http://games.sina.com.cn/w/n/2013-04-28/1634703505.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1246703487.shtml',
  'http://games.sina.com.cn/w/n/2013-04-28/1028703471.shtml',
  'http://games.sina.com.cn/w/n/2013-04-27/1015703426.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1554703373.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1512703346.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1453703334.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1451703333.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1445703329.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1434703322.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703321.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1433703320.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703318.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1429703317.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1409703297.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1406703296.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1402703292.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1353703286.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1348703284.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1327703275.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1239703265.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1238703264.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1231703262.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1229703261.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1228703260.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1223703259.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1218703258.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1202703254.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1159703251.shtml',
  'http://games.sina.com.cn/w/n/2013-04-26/1139703233.shtml',
]
 
def workerbee(inputlist):
  """Start the worker threads, queue every job and wait for completion.

  Args:
    inputlist: Iterable of jobs; each item is put on ``jobs`` unchanged and
      later consumed by a ``spider`` thread.

  A job that cannot be queued within 5 seconds (the queue stayed full) is
  reported on the console and skipped.
  """
  for x in range(THREAD_LIMIT):
    print('Thead {0} started.'.format(x))
    t = spider()
    t.start()

  for i in inputlist:
    try:
      jobs.put(i, block=True, timeout=5)
    except Queue.Full:
      # Only a genuinely full queue is reported as such; any other error
      # now propagates instead of being mislabelled.
      with singlelock:
        print("The queue is full !")

  # Hold the lock while printing so the message is not interleaved with
  # output from the workers.
  with singlelock:
    print("Waiting for threads to finish.")
  # Blocks until task_done() has been called once for every queued job.
  jobs.join()
 
def getTitle(url,time=10):
  """Download *url* and return its first ``<title>`` text as UTF-8 bytes.

  Args:
    url: Address of the page to fetch.
    time: Timeout in seconds handed to ``urllib2.urlopen``.

  Returns:
    The title decoded as GB2312 (undecodable bytes are replaced) and
    re-encoded as UTF-8, or an empty string when the page contains no
    ``<title>`` element.

  Errors raised by ``urllib2.urlopen`` or ``read`` propagate to the caller.
  """
  response = urllib2.urlopen(url, timeout=time)
  try:
    html = response.read()
  finally:
    # Release the connection even when reading the body fails.
    response.close()
  # The published pattern was r'(.*?)': its <title> tags had been stripped,
  # so it matched the empty string at position 0 and the function always
  # returned ''.  The tags are restored; the match ignores case and may span
  # several lines.
  matches = re.findall(br'<title>(.*?)</title>', html, re.I | re.S)
  raw = matches[0] if matches else b''
  return raw.decode('gb2312', 'replace').encode('utf-8')
 
class spider(threading.Thread):
  """Worker thread that prints the page title for each URL taken from ``jobs``."""

  def run(self):
    """Process jobs until the queue has stayed empty for one second."""
    while True:
      try:
        job = jobs.get(True, 1)
      except Queue.Empty:
        # Nothing arrived within the timeout: let the thread end.
        break
      try:
        # The download runs without holding singlelock.  The original held
        # the lock across the fetch, which serialised the workers and leaked
        # the lock whenever the fetch failed.
        title = getTitle(job)
        # The lock is taken only around the console write.
        with singlelock:
          print('This {0} is {1}'.format(job, title))
      except Exception as err:
        # One bad URL must not kill the worker; report and carry on.
        with singlelock:
          sys.stderr.write('Failed to fetch {0}: {1}\n'.format(job, err))
      finally:
        # Always account for the job, otherwise jobs.join() in the main
        # thread would block forever after a failure.
        jobs.task_done()
 
if __name__ == '__main__':
  # Crawl the hard-coded URL list when the file is run as a script.
  workerbee(urls)

相关内容

热门资讯

500 行 Python 代码... 语法分析器描述了一个句子的语法结构,用来帮助其他的应用进行推理。自然语言引入了很多意外的歧义,以我们...
定时清理删除C:\Progra... C:\Program Files (x86)下面很多scoped_dir开头的文件夹 写个批处理 定...
65536是2的几次方 计算2... 65536是2的16次方:65536=2¹⁶ 65536是256的2次方:65536=256² 6553...
Mobi、epub格式电子书如... 在wps里全局设置里有一个文件关联,打开,勾选电子书文件选项就可以了。
scoped_dir32_70... 一台虚拟机C盘总是莫名其妙的空间用完,导致很多软件没法再运行。经过仔细检查发现是C:\Program...
pycparser 是一个用... `pycparser` 是一个用 Python 编写的 C 语言解析器。它可以用来解析 C 代码并构...
小程序支付时提示:appid和... [Q]小程序支付时提示:appid和mch_id不匹配 [A]小程序和微信支付没有进行关联,访问“小...
微信小程序使用slider实现... 众所周知哈,微信小程序里面的音频播放是没有进度条的,但最近有个项目呢,客户要求音频要有进度条控制,所...
python绘图库Matplo... 本文简单介绍了Python绘图库Matplotlib的安装,简介如下: matplotlib是pyt...
Prometheus+Graf... 一,Prometheus概述 1,什么是Prometheus?Prometheus是最初在Sound...