Measuring Full Page Load Time with Python

Warning
This article was last updated on 2019-02-04; its contents may be out of date.

This article describes a script for measuring a site's full page load time. A while back I spent a long time searching the web for a tool that could time the complete load of a live page, without finding a good fit. After combing through GitHub for quite a while, I finally found a project close to my needs and adapted it. One remaining limitation: it does not fetch resources asynchronously, so every asset is requested sequentially and the reported total is the sum of the individual request times.

Reference project: [https://github.com/donjajo/loady.git]
My modified code: [https://github.com/0x5c0f/zbx_page_load.git]
Extra modules the script depends on: requests and bs4 (Beautiful Soup 4.x)

Installing the modules:
pip3 install requests bs4 (or python3 -m pip install requests bs4; the bs4 package on PyPI pulls in beautifulsoup4)
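A quick check that both modules import cleanly (a minimal sanity test, nothing more):

python3 -c "import requests, bs4; print('ok')"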

The probe script, page-load.py:

#!/usr/bin/env python3
#
#UserParameter=custom.page.load[*],/opt/sh/zbx_discover_site/page-load.py $1
#
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import sys
from time import time

# Set to 1 to print each fetched URL and any errors to stdout.
# Keep at 0 when run from Zabbix: extra output would corrupt the item value.
debug = 0

class Loady:
    def __init__( self, url, headers = {} ):
        if not isinstance( headers, dict ):
            raise ValueError( 'Headers argument must be dict instance' )

        self.url = url
        self.total_time = 0
        self.js = []
        self.css = []
        self.img = []
        self.http_headers = headers
        self.soup = None
        self.total_size = 0
        # Per-instance result store; kept out of the class body so that
        # multiple Loady instances would not share a single dict
        self.files = {
            'js' : {},
            'css' : {},
            'img' : {}
        }

    def _get( self, tag ):
        """Collects the URLs of the site's additional files and normalizes them for loading"""

        # Scheme and domain of the page under test
        domain_scheme, domain, _, _, _, _ = urllib.parse.urlparse( self.url )
        urls = []

        if tag == 'script':
            # All script tags that carry a src attribute
            tags = self.soup.find_all( 'script', { 'src' : re.compile( r'.*' ) } )
        elif tag == 'img':
            tags = self.soup.find_all( 'img', { 'src' : re.compile( r'.*' ) } )
        else:
            # All link tags with rel=stylesheet
            tags = self.soup.find_all( 'link', { 'rel' : 'stylesheet' } )

        for each_tag in tags:
            # Take src for scripts and images, href for stylesheets
            val = each_tag[ 'src' ] if tag in ( 'script', 'img' ) else each_tag[ 'href' ]

            # Parse the collected URL
            url = urllib.parse.urlparse( val )

            if not url[ 0 ] and url[ 1 ]:
                # No scheme but a domain name: assume it supports the main
                # site's scheme and prepend it
                if not val.startswith( "//" ):
                    urls.append( '{0}://{1}'.format( domain_scheme, val ) )
                else:
                    # Protocol-relative URL (//cdn.example.com/...)
                    urls.append( '{0}:{1}'.format( domain_scheme, val ) )
            elif not url[ 1 ]:
                # No domain: a relative path, so prepend the site's domain
                if not val.startswith( "/" ):
                    urls.append( '{0}://{1}/{2}'.format( domain_scheme, domain, val ) )
                else:
                    urls.append( '{0}://{1}{2}'.format( domain_scheme, domain, val ) )
            else:
                # Already an absolute URL; use it as-is
                urls.append( val )

        if tag == 'script':
            self.js = urls
        elif tag == 'img':
            self.img = urls
        else:
            self.css = urls

    def _load( self, t ):
        """Fetches the collected links, recording response time and size in self.files"""
        if t == 'script':
            _link_obj = self.js
        elif t == 'img':
            _link_obj = self.img
        else:
            _link_obj = self.css

        for link in _link_obj:
            if debug == 1:
                print(link)
            try:
                start = time()
                r = requests.get( link )
                end = time()
                # Calculate the total time taken to load link
                response_time = ( end - start )
                # Page loaded successfully
                if r.status_code == 200:
                    # Size of the fetched content. Note that sys.getsizeof measures
                    # the Python object, not the transfer size; len(r.content)
                    # would give the exact byte count
                    size = sys.getsizeof( r.content ) if t == 'img' else sys.getsizeof( r.text )
                    # Map the tag type to its key in self.files
                    if t == 'style':
                        obj = 'css'
                    elif t == 'img':
                        obj = 'img'
                    else:
                        obj = 'js'
                    self.files[ obj ][ link ] = { 'byte_size' : size, 'load_time' : response_time }
                    # Sum up total time to the existing load time
                    self.total_time += response_time
                    self.total_size += size
            except Exception as e:
                if debug == 1:
                    print(e,link)
                continue

    def get( self ):
        """Loads the main website, calculate response time, page size and get additional files in site"""

        start = time()
        r = requests.get( self.url, headers = self.http_headers )
        stop = time()
        if r.status_code == 200:
            response = r.text
            self.total_time = self.total_time + ( stop - start )
            self.total_size += sys.getsizeof( response )
            self.soup = BeautifulSoup( response, 'html.parser' )

            self._get( 'script' )
            self._load( 'script' )
            self._get( 'style' )
            self._load( 'style' )
            self._get( 'img' )
            self._load('img')


load = Loady( sys.argv[1], headers = { 'User-Agent' : 'zabbix pageload monitor' } )
# load = Loady( sys.argv[1], headers = { 'User-Agent' : 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0' } )
load.get()
print( "%.3f" % load.total_time )   # total load time in seconds, as the Zabbix item below expects
# print( load.total_size )          # alternative: total bytes loaded
# print( load.files )               # alternative: per-file sizes and load times

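A quick manual run of the probe (the timing shown is illustrative; set debug = 1 to also print each asset URL as it is fetched):

$ /opt/sh/zbx_discover_site/page-load.py https://www.example.com
1.734
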
Since I use this for monitoring, I set up a low-level discovery rule in Zabbix. The discovery script, discover_site.py:

#!/usr/bin/env python3
# Discover sites: emit Zabbix low-level discovery JSON
# for the URLs listed in site.txt, one per line

file = open("/opt/sh/zbx_discover_site/site.txt")
print("{")
print("\t\"data\":[")
try:
	lines = file.readlines()
	count = 1
	for line in lines:
		line = line.strip("\n")
		print("\t\t{")
		print("\t\t\t\"{#SITE}\":\"",end='')
		print(line,end='')
		print("\"")
		print("\t\t}",end='')
		# Comma between entries, but not after the last one
		if count < len(lines):
			print(",")
		count = count + 1
finally:
	file.close()
print("\n\t]")
print("}")

The site list, site.txt:

https://www.example.com
http://www.example.com
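
With the two example URLs above, discover_site.py emits low-level discovery JSON in exactly the format Zabbix expects:

{
	"data":[
		{
			"{#SITE}":"https://www.example.com"
		},
		{
			"{#SITE}":"http://www.example.com"
		}
	]
}

Building this with the json module would handle quoting and escaping automatically, but for a plain list of URLs the hand-built output is sufficient.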

Zabbix frontend configuration: I simply dedicate one fixed host to this job; I don't see much need to package it as a template.

Create the discovery rule: Configuration → Hosts (or Templates) → Discovery → Create discovery rule, and set the key to: discover.site

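On the agent side, both scripts are wired in through UserParameter entries in zabbix_agentd.conf. The custom.page.load line comes from the script header above; the discover.site line is my assumption, matching the key used by the discovery rule and the same install path:

UserParameter=custom.page.load[*],/opt/sh/zbx_discover_site/page-load.py $1
UserParameter=discover.site,/opt/sh/zbx_discover_site/discover_site.py
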
Filter configuration:

{#SITE} matches @Linux site for autodiscovery

Item prototype configuration:
Name: page load on [{#SITE}]
Key: custom.page.load[{#SITE}]
Type of information: Numeric (float)
Units: s
Update interval: 300 (my personal suggestion)

Graph prototype configuration:
Name: page load on {#SITE}

Create a global regular expression:
Name: Linux site for autodiscovery (must match the filter configured above)
Result is TRUE: ^((http|ftp|https)://) (ideally this would be a proper URL-matching rule, but here I simply let anything starting with http/https/ftp pass)
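
A minimal sketch for testing the expression against the site list before wiring it into Zabbix (same pattern and file path as above):

import re

pattern = re.compile(r'^((http|ftp|https)://)')
with open('/opt/sh/zbx_discover_site/site.txt') as f:
    for line in f:
        url = line.strip()
        # Mirrors the "Result is TRUE" condition of the global regexp
        print(url, bool(pattern.match(url)))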

Once everything is configured, restart the zabbix agent and you are done. If you did build it as a template, just link the template to the relevant hosts.
