httpz

- Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git
parsers.py (4986B)
      1 #!/usr/bin/env python3
      2 # HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
      3 # httpz/parsers.py
      4 
      5 try:
      6     import bs4
      7 except ImportError:
      8     raise ImportError('missing bs4 module (pip install beautifulsoup4)')
      9 
     10 try:
     11     from cryptography import x509
     12     from cryptography.hazmat.primitives import hashes
     13     from cryptography.x509.oid import NameOID
     14 except ImportError:
     15     raise ImportError('missing cryptography module (pip install cryptography)')
     16 
     17 try:
     18     import mmh3
     19 except ImportError:
     20     raise ImportError('missing mmh3 module (pip install mmh3)')
     21 
     22 from .utils import debug, error
     23 
     24 
     25 def parse_domain_url(domain: str) -> tuple:
     26     '''
     27     Parse domain string into base domain, port, and protocol list
     28     
     29     :param domain: Raw domain string to parse
     30     :return: Tuple of (base_domain, port, protocols)
     31     '''
     32     port = None
     33     base_domain = domain.rstrip('/')
     34     
     35     if base_domain.startswith(('http://', 'https://')):
     36         protocol = 'https://' if base_domain.startswith('https://') else 'http://'
     37         base_domain = base_domain.split('://', 1)[1]
     38         if ':' in base_domain.split('/')[0]:
     39             base_domain, port_str = base_domain.split(':', 1)
     40             try:
     41                 port = int(port_str.split('/')[0])
     42             except ValueError:
     43                 port = 443 if protocol == 'https://' else 80
     44         else:
     45             port = 443 if protocol == 'https://' else 80
     46         protocols = [f'{protocol}{base_domain}{":" + str(port) if port else ""}']
     47     else:
     48         if ':' in base_domain.split('/')[0]:
     49             base_domain, port_str = base_domain.split(':', 1)
     50             port = int(port_str.split('/')[0]) if port_str.split('/')[0].isdigit() else 443
     51         else:
     52             port = 443
     53         protocols = [
     54             f'https://{base_domain}{":" + str(port) if port else ""}',
     55             f'http://{base_domain}{":"  + str(port) if port else ""}'
     56         ]
     57     
     58     return base_domain, port, protocols
     59 
     60 async def get_cert_info(ssl_object, url: str) -> dict:
     61     '''
     62     Get SSL certificate information for a domain
     63     
     64     :param ssl_object: SSL object to get certificate info from
     65     :param url: URL to get certificate info from
     66     '''
     67     try:            
     68         if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)):
     69             return None
     70 
     71         cert = x509.load_der_x509_certificate(cert_der)
     72 
     73         try:
     74             san_extension = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
     75             alt_names     = [name.value for name in san_extension.value] if san_extension else []
     76         except x509.extensions.ExtensionNotFound:
     77             alt_names = []
     78 
     79         try:
     80             common_name = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
     81         except IndexError:
     82             common_name = None
     83 
     84         try:
     85             issuer = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
     86         except IndexError:
     87             issuer = None
     88 
     89         return {
     90             'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
     91             'common_name'   : common_name,
     92             'issuer'        : issuer,
     93             'alt_names'     : alt_names,
     94             'not_before'    : cert.not_valid_before_utc.isoformat(),
     95             'not_after'     : cert.not_valid_after_utc.isoformat(),
     96             'version'       : cert.version.value,
     97             'serial_number' : format(cert.serial_number, 'x'),
     98         }
     99     except Exception as e:
    100         error(f'Error getting cert info for {url}: {str(e)}')
    101         return None
    102 
    103 async def get_favicon_hash(session, base_url: str, html: str) -> str:
    104     '''
    105     Get favicon hash from a webpage
    106     
    107     :param session: aiohttp client session
    108     :param base_url: base URL of the website
    109     :param html: HTML content of the page
    110     '''
    111     try:
    112         soup = bs4.BeautifulSoup(html, 'html.parser')
    113         
    114         favicon_url = None
    115         for link in soup.find_all('link'):
    116             if link.get('rel') and any(x.lower() == 'icon' for x in link.get('rel')):
    117                 favicon_url = link.get('href')
    118                 break
    119         
    120         if not favicon_url:
    121             favicon_url = '/favicon.ico'
    122         
    123         if favicon_url.startswith('//'):
    124             favicon_url = 'https:' + favicon_url
    125         elif favicon_url.startswith('/'):
    126             favicon_url = base_url + favicon_url
    127         elif not favicon_url.startswith(('http://', 'https://')):
    128             favicon_url = base_url + '/' + favicon_url
    129 
    130         async with session.get(favicon_url, timeout=10) as response:
    131             if response.status == 200:
    132                 content    = (await response.read())[:1024*1024]
    133                 hash_value = mmh3.hash64(content)[0]
    134                 if hash_value != 0:
    135                     return str(hash_value)
    136 
    137     except Exception as e:
    138         debug(f'Error getting favicon for {base_url}: {str(e)}')
    139     
    140     return None