Development environment: Python 3.4
Operating system: Windows 8
What it does: crawls the chapter list of a given novel page, saves each chapter to a local text file, and logs every crawled URL to a local file so it is not fetched again.
Target site: http://www.cishuge.com/
Novel: 灵棺夜行
Code origin: written entirely by me

import urllib.request
import http.cookiejar

import socket
import time
import re

timeout = 20
socket.setdefaulttimeout(timeout)  # give up on connections that stall for 20 s

sleep_download_time = 10  # polite pause between chapter downloads (used in the loop below)
 
def makeMyOpener(head={
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}):
    # Build a cookie-aware opener that sends browser-like headers,
    # so the site treats the script like an ordinary IE 11 visitor.
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())
    return opener
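
As an aside (not part of the script itself), here is a minimal way to sanity-check the opener; it simply fetches the site's front page and prints the response status:

op_test = makeMyOpener()
resp = op_test.open('http://www.cishuge.com/', timeout=timeout)
print(resp.status, resp.getheader('Content-Type'))  # expect 200 and an HTML content type
resp.close()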
 
def saveFile(save_path, txts):
    # Write one chapter per file; an explicit encoding avoids
    # platform-dependent codec errors on Windows.
    with open(save_path, 'w', encoding='utf-8') as f_obj:
        for item in txts:
            f_obj.write(item + '\n')
 
# Fetch the chapter list page
code_list = 'http://www.cishuge.com/read/0/771/'
oper = makeMyOpener()
uop = oper.open(code_list)
data = uop.read().decode('gbk', 'ignore')  # the site serves GBK-encoded pages
uop.close()

# NOTE: the original regular expressions were lost when this post was rendered
# as HTML. The patterns below are assumptions about the site's markup at the
# time; adjust them to the actual page source. Each match is expected to be a
# (relative-url, chapter-title) pair.
list_pattern = re.compile(r'<li><a href="(.*?)".*?>(.*?)</a></li>', re.S)
items = re.findall(list_pattern, data)
print('Chapter list fetched')

# Load the log of already-downloaded URLs (create it on the first run)
url_path = 'url_file.txt'
try:
    with open(url_path, 'r') as url_r:
        url_arr = url_r.readlines()
except FileNotFoundError:
    url_arr = []
print(len(url_arr))

url_file = open(url_path, 'a')
print('Loaded the list of already-downloaded URLs')

for tmp in items:
    save_path = tmp[1].replace(' ', '') + '.txt'  # chapter title as file name
    url = code_list + tmp[0]
    if url + '\n' in url_arr:
        continue  # already downloaded on a previous run
    print('Logging: ' + url)
    url_file.write(url + '\n')
    opene = makeMyOpener()
    op1 = opene.open(url)
    data = op1.read().decode('gbk', 'ignore')
    op1.close()  # close the response, not the opener
    # Assumed pattern for the chapter body; adjust to the real markup.
    content_pattern = re.compile(r'<div id="content">(.*?)</div>', re.S)
    txts = re.findall(content_pattern, data)
    saveFile(save_path, txts)
    time.sleep(sleep_download_time)  # be polite between requests

url_file.close()

The code still has a few rough edges, but I'm sharing it anyway so we can improve it together.
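
One possible improvement, sketched below rather than folded into the script: with a 20-second socket timeout, a slow connection will occasionally drop a request, so it is worth retrying a few times before giving up. The helper name, retry count, and delay here are my own choices, not part of the original code.

import socket
import time
import urllib.error
import urllib.request

def open_with_retry(opener, url, retries=3, delay=5):
    # Try the request up to `retries` times, sleeping between attempts.
    for attempt in range(retries):
        try:
            return opener.open(url)
        except (socket.timeout, urllib.error.URLError):
            if attempt == retries - 1:
                raise  # out of attempts: let the caller see the error
            time.sleep(delay)

In the main loop above, op1 = open_with_retry(opene, url) could then replace the bare opene.open(url) call.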