Measuring Full Page Load Time with Python

Warning
This article was last updated on 2019-02-04; its content may be out of date.

This article presents a script that measures the full page load time of a site. A while back I searched for a tool that could time the complete load of a live site, but found nothing suitable. After digging through GitHub for quite a while I finally came across a project that roughly fit my needs, so I took it and adapted it. One remaining limitation is that resources are not loaded asynchronously.

Reference project: https://github.com/donjajo/loady.git
Modified code: https://github.com/0x5c0f/zbx_page_load.git
Extra module the script depends on: bs4 (Beautiful Soup 4.x)

Module installation:
pip3 install bs4 (or python3 -m pip install bs4)

The probe script, page-load.py:

#!/usr/bin/env python3
#
#UserParameter=custom.page.load[*],/opt/sh/zbx_discover_site/page-load.py $1
#
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
import sys
from time import time

debug=0

class Loady:

	def __init__(self, url, headers={}):
		if not isinstance(headers, dict):
			raise ValueError('Headers argument must be a dict instance')

		self.url = url
		self.total_time = 0
		self.js = []
		self.css = []
		self.img = []
		self.http_headers = headers
		self.soup = None
		self.total_size = 0
		# Per-instance result store: URL -> {byte_size, load_time} for each
		# resource type (kept here rather than as a class attribute so results
		# are not shared across instances)
		self.files = {
			'js': {},
			'css': {},
			'img': {},
		}

	def _get(self, tag):
		"""Collect the site's additional resources and normalize their URLs for loading."""

		# Parse the scheme and domain of the current URL
		domain_scheme, domain, _, _, _, _ = urllib.parse.urlparse(self.url)
		urls = []

		# NB: compare strings with ==, not 'is' ('is' tests object identity)
		if tag == 'script':
			# All <script> tags with a src attribute
			tags = self.soup.find_all('script', {'src': re.compile(r'.*')})
		elif tag == 'img':
			# All <img> tags with a src attribute
			tags = self.soup.find_all('img', {'src': re.compile(r'.*')})
		else:
			# All <link rel="stylesheet"> tags
			tags = self.soup.find_all('link', {'rel': 'stylesheet'})

		for each_tag in tags:
			# Scripts and images carry the URL in src; stylesheets in href
			val = each_tag['src'] if tag in ('script', 'img') else each_tag['href']

			# Parse the extracted URL
			url = urllib.parse.urlparse(val)

			if not url[0] and url[1]:
				# URL has a domain but no scheme (e.g. //cdn.example.com/x.js):
				# reuse the main site's scheme
				if not val.startswith('//'):
					urls.append('{0}://{1}'.format(domain_scheme, val))
				else:
					urls.append('{0}:{1}'.format(domain_scheme, val))
			elif not url[1]:
				# URL has no domain: it is a relative path, so prepend the
				# site's scheme and domain
				if not val.startswith('/'):
					urls.append('{0}://{1}/{2}'.format(domain_scheme, domain, val))
				else:
					urls.append('{0}://{1}{2}'.format(domain_scheme, domain, val))
			else:
				# Already an absolute URL, use it as-is
				urls.append(val)

		if tag == 'script':
			self.js = urls
		elif tag == 'img':
			self.img = urls
		else:
			self.css = urls

	def _load(self, t):
		"""Fetch the collected links, recording response time and size in self.files."""
		if t == 'script':
			_link_obj = self.js
		elif t == 'img':
			_link_obj = self.img
		else:
			_link_obj = self.css

		for link in _link_obj:
			if debug == 1:
				print(link)
			try:
				start = time()
				r = requests.get(link)
				end = time()
				# Total time taken to load this link
				response_time = end - start
				# Resource loaded successfully
				if r.status_code == 200:
					# len(r.content) is the body's actual byte size, for text
					# and binary alike (sys.getsizeof would measure the Python
					# object's memory footprint instead)
					size = len(r.content)
					# File the result under the matching resource type
					if t == 'style':
						obj = 'css'
					elif t == 'img':
						obj = 'img'
					else:
						obj = 'js'
					self.files[obj][link] = {'byte_size': size, 'load_time': response_time}
					# Accumulate the totals
					self.total_time += response_time
					self.total_size += size
			except Exception as e:
				if debug == 1:
					print(e, link)
				continue

	def get(self):
		"""Load the main page, record its response time and size, then fetch its resources."""

		start = time()
		r = requests.get(self.url, headers=self.http_headers)
		stop = time()
		if r.status_code == 200:
			self.total_time += stop - start
			self.total_size += len(r.content)
			self.soup = BeautifulSoup(r.text, 'html.parser')

			self._get('script')
			self._load('script')
			self._get('style')
			self._load('style')
			self._get('img')
			self._load('img')


load = Loady(sys.argv[1], headers={'User-Agent': 'zabbix pageload monitor'})
load.get()
# Print the cumulative load time in seconds, to millisecond precision
print('%.3f' % load.total_time)
# print(load.total_size)  # total bytes downloaded
# print(load.files)       # per-resource size and load time
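Invoked by hand, the script takes the site URL as its only argument and prints the cumulative load time in seconds:

./page-load.py https://www.example.com

As noted above, the script's main weakness is that it downloads resources serially. A minimal sketch of how the per-resource loop could be parallelized with the standard concurrent.futures module (timed_get and load_concurrently are hypothetical helpers, not part of the original script):

from concurrent.futures import ThreadPoolExecutor
from time import time
import requests

def timed_get(link):
	"""Fetch one resource; return (link, byte_size, elapsed_seconds)."""
	start = time()
	r = requests.get(link, timeout=10)
	return link, len(r.content), time() - start

def load_concurrently(links, workers=8):
	"""Fetch all links in parallel; a possible drop-in for the loop in Loady._load."""
	with ThreadPoolExecutor(max_workers=workers) as pool:
		return list(pool.map(timed_get, links))

Note that with parallel fetches, the sum of per-resource times no longer equals the wall-clock load time; you would time the whole batch instead.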

Since I use this for monitoring, I created a low-level discovery rule in Zabbix. The discovery script, discover_site.py:

#!/usr/bin/env python3
# discover site: emit Zabbix low-level-discovery JSON from a list of site URLs

file = open("/opt/sh/zbx_discover_site/site.txt")
print("{")
print("\t\"data\":[")
try:
	lines = file.readlines()
	count = 1
	for line in lines:
		line = line.strip("\n")
		print("\t\t{")
		print("\t\t\t\"{#SITE}\":\"", end='')
		print(line, end='')
		print("\"")
		print("\t\t}", end='')
		# JSON forbids a trailing comma, so only emit one between entries
		if count < len(lines):
			print(",")
		count += 1
finally:
	file.close()
print("\n\t]")
print("}")

The site list, site.txt:

https://www.example.com
http://www.example.com
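With the two example entries above, discover_site.py prints the low-level-discovery JSON that Zabbix expects:

{
	"data":[
		{
			"{#SITE}":"https://www.example.com"
		},
		{
			"{#SITE}":"http://www.example.com"
		}
	]
}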

Zabbix frontend configuration: I simply dedicated a fixed host to this; I don't see much need to package it as a template.

Create the discovery rule: Configuration – Hosts (Templates) – Discovery – Create discovery rule, and set the key to: discover.site. Both keys also need to be exposed on the agent side, as shown below.
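The zabbix_agentd.conf entry for custom.page.load is taken from the script header; the one for discover.site assumes the discovery script sits in the same directory (adjust the paths to your layout):

UserParameter=custom.page.load[*],/opt/sh/zbx_discover_site/page-load.py $1
UserParameter=discover.site,/opt/sh/zbx_discover_site/discover_site.py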

Filter configuration:

{#SITE} matches @Linux site for autodiscovery

Item prototype configuration:
Name: page load on [{#SITE}]
Key: custom.page.load[{#SITE}]
Type of information: Numeric (float)
Units: s
Update interval: 300 (my personal suggestion)

Graph prototype configuration:
Name: page load on {#SITE}

Create a new global regular expression:
Name: Linux site for autodiscovery (must match the filter configuration above)
Result is TRUE: ^((http|ftp|https)://) (ideally this would be a proper URL-validation pattern; here I only check that an entry starts with http/https/ftp and let it through)

Once everything is configured, restart the Zabbix agent and you are done. If you built it as a template instead, just link the template to the relevant hosts.
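Rather than waiting for the next discovery cycle, you can verify both keys directly from the Zabbix server with zabbix_get (the host placeholder and the example URL are yours to adjust):

zabbix_get -s <agent-host> -k 'discover.site'
zabbix_get -s <agent-host> -k 'custom.page.load[https://www.example.com]'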