由于某些原因,网站一开始没有设计统计模块。
如今需要补上统计功能,只能借助百度统计或者阿里云 CDN 的日志文件。阿里云 CDN 的日志文件记录的是 Web 访问信息,单条记录示例及各字段含义如下:
[9/Mar/2016:00:00:16 +0800] 222.171.7.89 - 62113 "http://cloud.insta360.com/post/5e7b029d8ed7e3c4b23006a71bab73c8?e=true&m=true" "GET http://cloud.insta360.com/public/media/mp4/5e7b029d8ed7e3c4b23006a71bab73c8_960x480.mp4" 206 509 20516390 HIT "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12H321 NewsApp/5.3.2" "video/mp4"
时间
访问IP
回源IP
responsetime
referer
method
访问URL
httpcode
requestsize
responsesize
cache命中状态
UA头
文件类型
# 将单条记录转换为Dict对象
def line2dict(line):
# Snippet, thanks to http://www.seehuhn.de/blog/52
parts = [
r\'\\[(?P
AliyunLog.py
# coding=utf-8
import fileinput
import re
import os
try:
import simplejson as json
except ImportError:
import json
# 读取输入文件并返回Dict对象
def readfile(file):
    """Read a log file and return its records as a dict keyed by line number.

    Each non-blank line is converted with ``line2dict`` and stored under its
    1-based line number, so skipped blank lines leave gaps in the keys.

    Args:
        file: Path of the log file to read. (The name shadows the Python 2
            builtin, but is kept so existing callers are unaffected.)

    Returns:
        A dict mapping line number to the value returned by ``line2dict``.
        The dict is empty when the file is smaller than 150 bytes or holds
        only blank lines.
    """
    records = {}

    # Just a guesstimate: a single log entry is believed to contain at least
    # 150 characters, so a smaller file cannot hold a usable record.
    if os.stat(file).st_size < 150:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Not a valid log file. It does not have enough data")
        return records

    try:
        for index, line in enumerate(fileinput.input(file), 1):
            # Skip blank lines, including whitespace-only ones, which carry
            # no record for line2dict to parse.
            if line.strip():
                records[index] = line2dict(line)
    finally:
        # Always release the module-level fileinput state, even when
        # line2dict raises part-way through the file.
        fileinput.close()

    return records
# 将单条记录转换为Dict对象
def line2dict(line):
# Snippet, thanks to http://www.seehuhn.de/blog/52
parts = [
r\'\\[(?P
main.py
#!/usr/bin/env python
# coding=utf-8
import sys
from AliyunLog import *
def main():
    """Command-line entry point: ``python main.py -f <filename>``.

    Validates the command-line arguments, then prints the result of
    ``toJson`` for the given log file. Exits with status 2 when fewer than
    two arguments are given, when the switch is not ``-f``, or when the
    named file does not exist.
    """
    # Imported locally so this script does not rely on ``os`` leaking in
    # through ``from AliyunLog import *``.
    import os

    if len(sys.argv) < 3:
        # NOTE(review): the file placeholder was missing from the usage
        # text (the message ended in "-f "); restored here as <filename>.
        print("Incorrect Syntax. Usage: python main.py -f <filename>")
        sys.exit(2)

    if sys.argv[1] != "-f":
        print("Invalid switch '" + sys.argv[1] + "'")
        sys.exit(2)

    if not os.path.isfile(sys.argv[2]):
        print("File does not exist")
        sys.exit(2)

    print(toJson(sys.argv[2]))


if __name__ == "__main__":
    main()
python main.py -f data
{
"6432": {
"res_time": "1728",
"res_ip": "118.114.213.118",
"req_size": "768",
"req_url": "GET http://cloud.insta360.com/public/media/mp4/f9e4bf15d452440c2884b234854d089c_audio.mp3",
"origin_ip": "-",
"referer": "http://cloud.insta360.com/post/f9e4bf15d452440c2884b234854d089c?m=true&from=timeline&isappinstalled=0",
"content_type": "audio/mpeg",
"time": "9/Mar/2016:00:59:58 +0800",
"ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13C75 MicroMessenger/6.3.13 NetType/WIFI Language/zh_CN",
"http_code": "206",
"res_size": "5290084",
"cache_status": "HIT"
},
...
}
参考了github上apache log的解析方法
原文地址:parse-aliyun-cdn-log-file-with-python
上一篇:很好的东西
下一篇:批量下载文件(以xxx网站为例)