httpz - Hyper-fast HTTP Scraping Tool |
git clone git://git.acid.vegas/httpz.git |
Log | Files | Refs | Archive | README | LICENSE |
parsers.py (4986B)
#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz/parsers.py

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

try:
    from cryptography import x509
    from cryptography.hazmat.primitives import hashes
    from cryptography.x509.oid import ExtensionOID, NameOID
except ImportError:
    raise ImportError('missing cryptography module (pip install cryptography)')

try:
    import mmh3
except ImportError:
    raise ImportError('missing mmh3 module (pip install mmh3)')

from typing import Optional

from .utils import debug, error


def parse_domain_url(domain: str) -> tuple:
    '''
    Parse domain string into base domain, port, and protocol list

    :param domain: Raw domain string to parse (may carry a scheme, port, and path)
    :return: Tuple of (base_domain, port, protocols) where protocols is the list of
             candidate URLs to probe

    NOTE(review): when no scheme is given, the port defaults to 443 and is appended to
    BOTH candidate URLs, so the http:// candidate probes port 443 — confirm intended.
    NOTE(review): when an explicit port is present, anything after it (e.g. a path) is
    dropped from base_domain, but a path without a port is kept — confirm intended.
    '''
    port = None
    base_domain = domain.rstrip('/')

    if base_domain.startswith(('http://', 'https://')):
        # Scheme given explicitly: probe only that scheme
        protocol = 'https://' if base_domain.startswith('https://') else 'http://'
        base_domain = base_domain.split('://', 1)[1]

        if ':' in base_domain.split('/')[0]:
            # Host portion carries an explicit port
            base_domain, port_str = base_domain.split(':', 1)
            try:
                port = int(port_str.split('/')[0])
            except ValueError:
                # Unparseable port: fall back to the scheme's default
                port = 443 if protocol == 'https://' else 80
        else:
            port = 443 if protocol == 'https://' else 80

        protocols = [f'{protocol}{base_domain}{":" + str(port) if port else ""}']
    else:
        # No scheme: produce both an https and an http candidate
        if ':' in base_domain.split('/')[0]:
            base_domain, port_str = base_domain.split(':', 1)
            port = int(port_str.split('/')[0]) if port_str.split('/')[0].isdigit() else 443
        else:
            port = 443

        protocols = [
            f'https://{base_domain}{":" + str(port) if port else ""}',
            f'http://{base_domain}{":" + str(port) if port else ""}'
        ]

    return base_domain, port, protocols


async def get_cert_info(ssl_object, url: str) -> Optional[dict]:
    '''
    Get SSL certificate information for a domain

    :param ssl_object: SSL object to get certificate info from
    :param url: URL to get certificate info from (used only for error logging)
    :return: Dict of certificate fields, or None if no cert is available or parsing fails
    '''
    try:
        # No TLS on this connection, or the peer sent no certificate
        if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)):
            return None

        cert = x509.load_der_x509_certificate(cert_der)

        try:
            san_extension = cert.extensions.get_extension_for_oid(ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            # get_extension_for_oid raises on absence, so san_extension is always set here
            alt_names = [name.value for name in san_extension.value]
        except x509.ExtensionNotFound:
            alt_names = []

        try:
            common_name = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            common_name = None

        try:
            issuer = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            issuer = None

        return {
            'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
            'common_name'   : common_name,
            'issuer'        : issuer,
            'alt_names'     : alt_names,
            'not_before'    : cert.not_valid_before_utc.isoformat(),
            'not_after'     : cert.not_valid_after_utc.isoformat(),
            'version'       : cert.version.value,
            'serial_number' : format(cert.serial_number, 'x'),
        }
    except Exception as e:
        # Best-effort: cert info is supplementary, so log and keep scanning
        error(f'Error getting cert info for {url}: {str(e)}')
        return None


async def get_favicon_hash(session, base_url: str, html: str) -> Optional[str]:
    '''
    Get favicon hash from a webpage

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    :return: Decimal string of the mmh3 64-bit hash, or None if no favicon could be hashed
    '''
    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        # Prefer an explicit <link rel="... icon ..."> tag (rel may hold several tokens)
        favicon_url = None
        for link in soup.find_all('link'):
            if link.get('rel') and any(x.lower() == 'icon' for x in link.get('rel')):
                favicon_url = link.get('href')
                break

        if not favicon_url:
            # Fall back to the conventional default location
            favicon_url = '/favicon.ico'

        # Resolve protocol-relative and relative references against the base URL
        if favicon_url.startswith('//'):
            # NOTE(review): forces https even when base_url is http — confirm intended
            favicon_url = 'https:' + favicon_url
        elif favicon_url.startswith('/'):
            favicon_url = base_url + favicon_url
        elif not favicon_url.startswith(('http://', 'https://')):
            favicon_url = base_url + '/' + favicon_url

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                # Cap the download at 1MB at the stream level instead of reading the
                # whole body and slicing, so an oversized favicon cannot exhaust memory
                content = await response.content.read(1024 * 1024)
                hash_value = mmh3.hash64(content)[0]
                # A zero hash is treated as "no usable favicon"
                if hash_value != 0:
                    return str(hash_value)

    except Exception as e:
        # Best-effort: favicon hash is supplementary, so log at debug and move on
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None