httpz

- Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git
Log | Files | Refs | Archive | README | LICENSE

scanner.py (12476B)

      1 #!/usr/bin/env python3
      2 # HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
      3 # httpz_scanner/scanner.py
      4 
      5 import asyncio
      6 import random
      7 import urllib.parse
      8 import json
      9 
     10 try:
     11     import aiohttp
     12 except ImportError:
     13     raise ImportError('missing aiohttp module (pip install aiohttp)')
     14 
     15 try:
     16     import bs4
     17 except ImportError:
     18     raise ImportError('missing bs4 module (pip install beautifulsoup4)')
     19 
     20 from .dns     import resolve_all_dns, load_resolvers
     21 from .parsers import parse_domain_url, get_cert_info, get_favicon_hash
     22 from .utils   import debug, USER_AGENTS, input_generator
     23 
     24 
     25 class HTTPZScanner:
     26     '''Core scanner class for HTTP domain checking'''
     27     
     28     def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None, paths = None, custom_headers=None, post_data=None):
     29         '''
     30         Initialize the HTTPZScanner class
     31         
     32         :param concurrent_limit: Maximum number of concurrent requests
     33         :param timeout: Request timeout in seconds
     34         :param follow_redirects: Follow redirects
     35         :param check_axfr: Check for AXFR
     36         :param resolver_file: Path to resolver file
     37         :param output_file: Path to output file
     38         :param show_progress: Show progress bar
     39         :param debug_mode: Enable debug mode
     40         :param jsonl_output: Output in JSONL format
     41         :param show_fields: Fields to show
     42         :param match_codes: Status codes to match
     43         :param exclude_codes: Status codes to exclude
     44         :param shard: Tuple of (shard_index, total_shards) for distributed scanning
     45         :param paths: List of additional paths to check on each domain
     46         :param custom_headers: Dictionary of custom headers to send with each request
     47         :param post_data: Data to send with POST requests
     48         '''
     49 
     50         self.concurrent_limit = concurrent_limit
     51         self.timeout          = timeout
     52         self.follow_redirects = follow_redirects
     53         self.check_axfr       = check_axfr
     54         self.resolver_file    = resolver_file
     55         self.output_file      = output_file
     56         self.show_progress    = show_progress
     57         self.debug_mode       = debug_mode
     58         self.jsonl_output     = jsonl_output
     59         self.shard            = shard
     60         self.paths            = paths or []
     61         self.custom_headers   = custom_headers or {}
     62         self.post_data        = post_data
     63 
     64         self.show_fields = show_fields or {
     65             'status_code'      : True,
     66             'content_type'     : True,
     67             'content_length'   : True,
     68             'title'            : True,
     69             'body'             : True,
     70             'ip'               : True,
     71             'favicon'          : True,
     72             'headers'          : True,
     73             'follow_redirects' : True,
     74             'cname'            : True,
     75             'tls'              : True
     76         }
     77 
     78         self.match_codes       = match_codes
     79         self.exclude_codes     = exclude_codes
     80         self.resolvers         = None
     81         self.processed_domains = 0
     82         self.progress_count    = 0
     83 
     84 
     85     async def check_domain(self, session: aiohttp.ClientSession, domain: str):
     86         '''Check a single domain and return results'''
     87         base_domain, port, protocols = parse_domain_url(domain)
     88         
     89         for protocol in protocols:
     90             url = f'{protocol}{base_domain}'
     91             if port:
     92                 url += f':{port}'
     93                 
     94             try:
     95                 debug(f'Trying {url}...')
     96                 result = await self._check_url(session, url)
     97                 debug(f'Got result for {url}: {result}')
     98                 if result and (result['status'] != 400 or result.get('redirect_chain')):  # Accept redirects
     99                     return result
    100             except Exception as e:
    101                 debug(f'Error checking {url}: {str(e)}')
    102                 continue
    103         
    104         return None
    105 
    async def _check_url(self, session: aiohttp.ClientSession, url: str):
        '''
        Issue a GET request for a single URL and describe the outcome

        Redirects are always followed (up to 10) and certificates are not
        verified. Failures are not raised; they are returned as a dictionary
        with status -1 and an error_type of CERT, SSL, CONN, HTTP, TIMEOUT or
        UNKNOWN.

        :param session: Client session used to issue the request
        :param url: Full URL including the protocol
        :return: Dictionary with domain, status, url and response_headers (plus
                 redirect_chain when redirects occurred), or an error dictionary
        '''

        def error_result(error_type, message):
            '''Build the error dictionary shared by every failure path'''
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': message,
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': error_type
            }

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            headers.update(self.custom_headers)

            # Passing a bare number as timeout is deprecated in aiohttp, so it is
            # wrapped in ClientTimeout; an existing ClientTimeout is used as-is
            timeout = self.timeout
            if not isinstance(timeout, aiohttp.ClientTimeout):
                timeout = aiohttp.ClientTimeout(total=timeout)

            debug(f'Making request to {url} with headers: {headers}')
            async with session.request('GET', url,
                timeout=timeout,
                allow_redirects=True,  # Always follow redirects
                max_redirects=10,
                ssl=False,  # Don't verify SSL
                headers=headers) as response:

                debug(f'Got response from {url}: status={response.status}, headers={dict(response.headers)}')

                result = {
                    'domain': urllib.parse.urlparse(url).hostname,
                    'status': response.status,
                    'url': str(response.url),
                    'response_headers': dict(response.headers)
                }

                if response.history:
                    result['redirect_chain'] = [str(h.url) for h in response.history] + [str(response.url)]
                    debug(f'Redirect chain for {url}: {result["redirect_chain"]}')

                return result

        # Handlers run from most to least specific. The certificate error is a
        # subclass of ClientSSLError, so it must come first to be reachable.
        except aiohttp.ClientConnectorCertificateError as e:
            debug(f'Certificate Error for {url}: {str(e)}')
            return error_result('CERT', f'Certificate Error: {str(e)}')
        except aiohttp.ClientSSLError as e:
            debug(f'SSL Error for {url}: {str(e)}')
            return error_result('SSL', f'SSL Error: {str(e)}')
        except aiohttp.ClientConnectorError as e:
            debug(f'Connection Error for {url}: {str(e)}')
            return error_result('CONN', f'Connection Failed: {str(e)}')
        except aiohttp.ClientError as e:
            debug(f'HTTP Error for {url}: {e.__class__.__name__}: {str(e)}')
            return error_result('HTTP', f'HTTP Error: {e.__class__.__name__}: {str(e)}')
        except asyncio.TimeoutError:
            debug(f'Timeout for {url}')
            return error_result('TIMEOUT', f'Connection Timed Out after {self.timeout}s')
        except Exception as e:
            debug(f'Unexpected error for {url}: {e.__class__.__name__}: {str(e)}')
            return error_result('UNKNOWN', f'Error: {e.__class__.__name__}: {str(e)}')
    189 
    190 
    async def scan(self, input_source):
        '''
        Scan domains from a file, stdin, or async generator

        Domains are read by a background task into a queue while up to
        concurrent_limit checks run at once; results are yielded as each check
        completes, so output order does not follow input order.

        :param input_source: Can be:
            - Path to file (str)
            - stdin ('-')
            - List/tuple of domains
            - Async generator yielding domains
        :yields: Result dictionary for each domain scanned
        :raises Exception: Any error raised while reading input_source, re-raised
                           after the domains already queued have been processed
        '''

        if not self.resolvers:
            self.resolvers = await load_resolvers(self.resolver_file)

        # Certificate verification is disabled for every connection
        connector = aiohttp.TCPConnector(ssl=False, enable_cleanup_closed=True)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = {}  # Maps each running task to the domain it is checking
            domain_queue = asyncio.Queue()

            async def process_domain(domain):
                '''Check one domain and always return (domain, result dictionary)'''
                try:
                    result = await self.check_domain(session, domain)
                    if self.show_progress:
                        self.progress_count += 1
                    if result:
                        return domain, result
                    else:
                        # Create a proper error result if check_domain returns None
                        return domain, {
                            'domain': domain,
                            'status': -1,
                            'error': 'No successful response from either HTTP or HTTPS',
                            'protocol': 'unknown',
                            'error_type': 'NO_RESPONSE'
                        }
                except Exception as e:
                    debug(f'Error processing {domain}: {e.__class__.__name__}: {str(e)}')
                    # Return structured error information
                    return domain, {
                        'domain': domain,
                        'status': -1,
                        'error': f'{e.__class__.__name__}: {str(e)}',
                        'protocol': 'unknown',
                        'error_type': 'PROCESS'
                    }

            async def queue_processor():
                '''Feed every input domain into the queue'''
                async for domain in input_generator(input_source, self.shard):
                    await domain_queue.put(domain)
                    self.processed_domains += 1

            # Start queue processor
            queue_task = asyncio.create_task(queue_processor())

            try:
                # The reader is finished once its task is done, whether it
                # succeeded or failed. Testing the task itself (rather than a
                # flag it sets on success) keeps a failed reader from leaving
                # this loop polling forever.
                while not (queue_task.done() and domain_queue.empty() and not tasks):
                    # Fill up tasks until we hit concurrent limit
                    while len(tasks) < self.concurrent_limit and not domain_queue.empty():
                        domain = await domain_queue.get()
                        task = asyncio.create_task(process_domain(domain))
                        tasks[task] = domain

                    if tasks:
                        # Wait for at least one task to complete
                        done, _ = await asyncio.wait(
                            list(tasks),
                            return_when=asyncio.FIRST_COMPLETED
                        )

                        # Process completed tasks
                        for task in done:
                            domain = tasks.pop(task)
                            try:
                                _, result = await task
                                if result:
                                    yield result
                            except Exception as e:
                                debug(f'Task error for {domain}: {e.__class__.__name__}: {str(e)}')
                                yield {
                                    'domain': domain,
                                    'status': -1,
                                    'error': f'Task Error: {e.__class__.__name__}: {str(e)}',
                                    'protocol': 'unknown',
                                    'error_type': 'TASK'
                                }
                    else:
                        await asyncio.sleep(0.1)  # Prevent CPU spin when no tasks

            finally:
                # Cancel whatever is still in flight and wait for it to unwind
                # before the session is closed
                for task in tasks:
                    task.cancel()
                if tasks:
                    await asyncio.gather(*tasks, return_exceptions=True)

                # Stop the reader; if it already failed, awaiting it re-raises
                # that failure to the caller
                queue_task.cancel()
                try:
                    await queue_task
                except asyncio.CancelledError:
                    pass