使用Python解析阿里云CDN日志_程序人生

使用Python解析阿里云CDN日志

admin

2023-07-31 01:45:57

0次

某些原因，一开始没有设计网站的统计模块
如今需要加上，只能借助于百度统计或者阿里云的cdn日志文件，阿里云cdn的日志文件是web的访问信息

log

[9/Mar/2016:00:00:16 +0800] 222.171.7.89 - 62113 "http://cloud.insta360.com/post/5e7b029d8ed7e3c4b23006a71bab73c8?e=true&m=true" "GET http://cloud.insta360.com/public/media/mp4/5e7b029d8ed7e3c4b23006a71bab73c8_960x480.mp4" 206 509 20516390 HIT "Mozilla/5.0 (iPhone; CPU iPhone OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12H321 NewsApp/5.3.2" "video/mp4"

fields

时间
访问IP
回源IP
responsetime
referer
method
访问URL
httpcode
requestsize
responsesize
cache命中状态
UA头
文件类型

re

# Regex fragments for one Aliyun CDN log record, listed in field order.
# Snippet, thanks to http://www.seehuhn.de/blog/52
_LOG_LINE_PARTS = [
    r'\[(?P<time>.+)\]',         # timestamp inside square brackets
    r'(?P<res_ip>\S+)',          # visitor (client) IP
    r'(?P<origin_ip>\S+)',       # back-to-origin IP ('-' when absent)
    r'(?P<res_time>[0-9]+)',     # response time
    r'"(?P<referer>.*)"',        # quoted Referer (may be empty)
    r'"(?P<req_url>.+)"',        # quoted "<METHOD> <URL>" request
    r'(?P<http_code>[0-9]+)',    # HTTP status code
    r'(?P<req_size>\S+)',        # request size (careful, can be '-')
    r'(?P<res_size>[0-9]+)',     # response size
    r'(?P<cache_status>\S+)',    # cache hit status, e.g. HIT
    r'"(?P<ua>.*)"',             # quoted User-Agent
    r'"(?P<content_type>.*)"',   # quoted content type
]

# Fields are separated by whitespace; trailing whitespace (including the
# line's newline) is tolerated. Compiled once so that parsing many lines
# does not rebuild the pattern on every call.
_LOG_LINE_PATTERN = re.compile(r'\s+'.join(_LOG_LINE_PARTS) + r'\s*\Z')


def line2dict(line):
    """Convert a single CDN log record into a dict.

    Args:
        line: One raw log line, with or without its trailing newline.

    Returns:
        A dict of strings keyed by ``time``, ``res_ip``, ``origin_ip``,
        ``res_time``, ``referer``, ``req_url``, ``http_code``, ``req_size``,
        ``res_size``, ``cache_status``, ``ua`` and ``content_type``.

    Raises:
        ValueError: If the line does not match the expected log format.
    """
    match = _LOG_LINE_PATTERN.match(line)
    if match is None:
        # Fail with the offending line instead of an AttributeError on None.
        raise ValueError("Unrecognized CDN log line: %r" % (line,))
    return match.groupdict()

script

AliyunLog.py

# coding=utf-8

import fileinput
import re
import os

try:
    import simplejson as json
except ImportError:
    import json


def readfile(file):
    """Read a CDN log file and return its parsed records.

    Args:
        file: Path of the log file to read.

    Returns:
        A dict mapping the 1-based line number of each record to the dict
        returned by ``line2dict``. Blank lines are skipped but still
        counted, so the keys may have gaps. The dict is empty when the
        file is smaller than 150 bytes.
    """
    filecontent = {}

    # Just a guesstimate: a single entry contains at least 150 chars.
    if os.stat(file).st_size < 150:
        print("Not a valid log file. It does not have enough data")
        return filecontent

    # The ``with`` block closes the file even when line2dict raises.
    with open(file) as handle:
        for index, line in enumerate(handle, 1):
            # Skip blank lines, including "\r\n" and whitespace-only ones.
            if line.strip():
                filecontent[index] = line2dict(line)
    return filecontent


# Regex fragments for one Aliyun CDN log record, listed in field order.
# Snippet, thanks to http://www.seehuhn.de/blog/52
_LOG_LINE_PARTS = [
    r'\[(?P<time>.+)\]',         # timestamp inside square brackets
    r'(?P<res_ip>\S+)',          # visitor (client) IP
    r'(?P<origin_ip>\S+)',       # back-to-origin IP ('-' when absent)
    r'(?P<res_time>[0-9]+)',     # response time
    r'"(?P<referer>.*)"',        # quoted Referer (may be empty)
    r'"(?P<req_url>.+)"',        # quoted "<METHOD> <URL>" request
    r'(?P<http_code>[0-9]+)',    # HTTP status code
    r'(?P<req_size>\S+)',        # request size (careful, can be '-')
    r'(?P<res_size>[0-9]+)',     # response size
    r'(?P<cache_status>\S+)',    # cache hit status, e.g. HIT
    r'"(?P<ua>.*)"',             # quoted User-Agent
    r'"(?P<content_type>.*)"',   # quoted content type
]

# Fields are separated by whitespace; trailing whitespace (including the
# line's newline) is tolerated. Compiled once so that parsing many lines
# does not rebuild the pattern on every call.
_LOG_LINE_PATTERN = re.compile(r'\s+'.join(_LOG_LINE_PARTS) + r'\s*\Z')


def line2dict(line):
    """Convert a single CDN log record into a dict.

    Args:
        line: One raw log line, with or without its trailing newline.

    Returns:
        A dict of strings keyed by ``time``, ``res_ip``, ``origin_ip``,
        ``res_time``, ``referer``, ``req_url``, ``http_code``, ``req_size``,
        ``res_size``, ``cache_status``, ``ua`` and ``content_type``.

    Raises:
        ValueError: If the line does not match the expected log format.
    """
    match = _LOG_LINE_PATTERN.match(line)
    if match is None:
        # Fail with the offending line instead of an AttributeError on None.
        raise ValueError("Unrecognized CDN log line: %r" % (line,))
    return match.groupdict()


# Convert all records of a log file into one JSON document.
def toJson(file):
    """Return the records that ``readfile`` parses from *file* as JSON.

    The output is encoded with an indent of 4.
    """
    encoder = json.JSONEncoder(indent=4)
    return encoder.encode(readfile(file))

main.py

#!/usr/bin/env python
# coding=utf-8

import sys
from AliyunLog import *

def main():
    """Command-line entry point: ``python main.py -f <filename>``.

    Checks ``sys.argv`` and prints the JSON produced by ``toJson`` for the
    file named after ``-f``. Prints a message and exits with status 2 when
    fewer than two arguments are given, when the switch is not ``-f``, or
    when the file does not exist.
    """
    # Imported here so main.py does not depend on ``os`` leaking through
    # ``from AliyunLog import *``.
    import os

    if len(sys.argv) < 3:
        print("Incorrect Syntax. Usage: python main.py -f <filename>")
        sys.exit(2)
    elif sys.argv[1] != "-f":
        print("Invalid switch '" + sys.argv[1] + "'")
        sys.exit(2)
    elif not os.path.isfile(sys.argv[2]):
        print("File does not exist")
        sys.exit(2)

    print(toJson(sys.argv[2]))


# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

result

run script

python main.py -f data

terminal


{
    "6432": {
        "res_time": "1728",
        "res_ip": "118.114.213.118",
        "req_size": "768",
        "req_url": "GET http://cloud.insta360.com/public/media/mp4/f9e4bf15d452440c2884b234854d089c_audio.mp3",
        "origin_ip": "-",
        "referer": "http://cloud.insta360.com/post/f9e4bf15d452440c2884b234854d089c?m=true&from=timeline&isappinstalled=0",
        "content_type": "audio/mpeg",
        "time": "9/Mar/2016:00:59:58 +0800",
        "ua": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13C75 MicroMessenger/6.3.13 NetType/WIFI Language/zh_CN",
        "http_code": "206",
        "res_size": "5290084",
        "cache_status": "HIT"
    },
    ...
}

参考了github上apache log的解析方法
原文地址：parse-aliyun-cdn-log-file-with-python

python

上一篇：很好的东西

下一篇：批量下载文件（以xxx网站为例）

使用Python解析阿里云CDN日志

log

fields

re

script

result

run script

terminal

more

相关内容

热门资讯