httpz - Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git
scanner.py (12476B)
#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz_scanner/scanner.py

import asyncio
import random
import urllib.parse
import json

try:
    import aiohttp
except ImportError:
    raise ImportError('missing aiohttp module (pip install aiohttp)')

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

from .dns import resolve_all_dns, load_resolvers
from .parsers import parse_domain_url, get_cert_info, get_favicon_hash
from .utils import debug, USER_AGENTS, input_generator


class HTTPZScanner:
    '''Core scanner class for HTTP domain checking'''

    def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None, paths = None, custom_headers=None, post_data=None):
        '''
        Initialize the HTTPZScanner class

        :param concurrent_limit: Maximum number of concurrent requests
        :param timeout: Request timeout in seconds
        :param follow_redirects: Follow redirects
        :param check_axfr: Check for AXFR
        :param resolver_file: Path to resolver file
        :param output_file: Path to output file
        :param show_progress: Show progress bar
        :param debug_mode: Enable debug mode
        :param jsonl_output: Output in JSONL format
        :param show_fields: Fields to show
        :param match_codes: Status codes to match
        :param exclude_codes: Status codes to exclude
        :param shard: Tuple of (shard_index, total_shards) for distributed scanning
        :param paths: List of additional paths to check on each domain
        :param custom_headers: Dictionary of custom headers to send with each request
        :param post_data: Data to send with POST requests
        '''

        self.concurrent_limit = concurrent_limit
        self.timeout          = timeout
        self.follow_redirects = follow_redirects
        self.check_axfr       = check_axfr
        self.resolver_file    = resolver_file
        self.output_file      = output_file
        self.show_progress    = show_progress
        self.debug_mode       = debug_mode
        self.jsonl_output     = jsonl_output
        self.shard            = shard
        self.paths            = paths or []
        self.custom_headers   = custom_headers or {}
        self.post_data        = post_data

        self.show_fields = show_fields or {
            'status_code'      : True,
            'content_type'     : True,
            'content_length'   : True,
            'title'            : True,
            'body'             : True,
            'ip'               : True,
            'favicon'          : True,
            'headers'          : True,
            'follow_redirects' : True,
            'cname'            : True,
            'tls'              : True
        }

        self.match_codes       = match_codes
        self.exclude_codes     = exclude_codes
        self.resolvers         = None
        self.processed_domains = 0
        self.progress_count    = 0


    async def check_domain(self, session: aiohttp.ClientSession, domain: str):
        '''Check a single domain and return results'''

        base_domain, port, protocols = parse_domain_url(domain)

        for protocol in protocols:
            url = f'{protocol}{base_domain}'
            if port:
                url += f':{port}'

            try:
                debug(f'Trying {url}...')
                result = await self._check_url(session, url)
                debug(f'Got result for {url}: {result}')
                if result and (result['status'] != 400 or result.get('redirect_chain')): # Accept redirects
                    return result
            except Exception as e:
                debug(f'Error checking {url}: {str(e)}')
                continue

        return None

    async def _check_url(self, session: aiohttp.ClientSession, url: str):
        '''Check a single URL and return results'''

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            headers.update(self.custom_headers)

            debug(f'Making request to {url} with headers: {headers}')
            async with session.request('GET', url,
                                       timeout=self.timeout,
                                       allow_redirects=True, # Always follow redirects
                                       max_redirects=10,
                                       ssl=False,            # Don't verify SSL
                                       headers=headers) as response:

                debug(f'Got response from {url}: status={response.status}, headers={dict(response.headers)}')

                result = {
                    'domain': urllib.parse.urlparse(url).hostname,
                    'status': response.status,
                    'url': str(response.url),
                    'response_headers': dict(response.headers)
                }

                if response.history:
                    result['redirect_chain'] = [str(h.url) for h in response.history] + [str(response.url)]
                    debug(f'Redirect chain for {url}: {result["redirect_chain"]}')

                return result

        except aiohttp.ClientSSLError as e:
            debug(f'SSL Error for {url}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'SSL Error: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'SSL'
            }
        except aiohttp.ClientConnectorCertificateError as e:
            debug(f'Certificate Error for {url}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Certificate Error: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'CERT'
            }
        except aiohttp.ClientConnectorError as e:
            debug(f'Connection Error for {url}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Connection Failed: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'CONN'
            }
        except aiohttp.ClientError as e:
            debug(f'HTTP Error for {url}: {e.__class__.__name__}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'HTTP Error: {e.__class__.__name__}: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'HTTP'
            }
        except asyncio.TimeoutError:
            debug(f'Timeout for {url}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Connection Timed Out after {self.timeout}s',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'TIMEOUT'
            }
        except Exception as e:
            debug(f'Unexpected error for {url}: {e.__class__.__name__}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Error: {e.__class__.__name__}: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'UNKNOWN'
            }


    async def scan(self, input_source):
        '''
        Scan domains from a file, stdin, or async generator

        :param input_source: Can be:
            - Path to file (str)
            - stdin ('-')
            - List/tuple of domains
            - Async generator yielding domains
        :yields: Result dictionary for each domain scanned
        '''

        if not self.resolvers:
            self.resolvers = await load_resolvers(self.resolver_file)

        # Just use ssl=False, that's all we need
        connector = aiohttp.TCPConnector(ssl=False, enable_cleanup_closed=True)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = {} # Dict so we can track the originating domain for each task
            domain_queue = asyncio.Queue()
            queue_empty = False

            async def process_domain(domain):
                try:
                    result = await self.check_domain(session, domain)
                    if self.show_progress:
                        self.progress_count += 1
                    if result:
                        return domain, result
                    else:
                        # Create a proper error result if check_domain returns None
                        return domain, {
                            'domain': domain,
                            'status': -1,
                            'error': 'No successful response from either HTTP or HTTPS',
                            'protocol': 'unknown',
                            'error_type': 'NO_RESPONSE'
                        }
                except Exception as e:
                    debug(f'Error processing {domain}: {e.__class__.__name__}: {str(e)}')
                    # Return structured error information
                    return domain, {
                        'domain': domain,
                        'status': -1,
                        'error': f'{e.__class__.__name__}: {str(e)}',
                        'protocol': 'unknown',
                        'error_type': 'PROCESS'
                    }

            # Queue processor
            async def queue_processor():
                nonlocal queue_empty
                async for domain in input_generator(input_source, self.shard):
                    await domain_queue.put(domain)
                    self.processed_domains += 1
                queue_empty = True

            # Start queue processor
            queue_task = asyncio.create_task(queue_processor())

            try:
                while not (queue_empty and domain_queue.empty() and not tasks):
                    # Fill up tasks until we hit concurrent limit
                    while len(tasks) < self.concurrent_limit and not domain_queue.empty():
                        domain = await domain_queue.get()
                        task = asyncio.create_task(process_domain(domain))
                        tasks[task] = domain

                    if tasks:
                        # Wait for at least one task to complete
                        done, _ = await asyncio.wait(
                            tasks.keys(),
                            return_when=asyncio.FIRST_COMPLETED
                        )

                        # Process completed tasks
                        for task in done:
                            domain = tasks.pop(task)
                            try:
                                _, result = await task
                                if result:
                                    yield result
                            except Exception as e:
                                debug(f'Task error for {domain}: {e.__class__.__name__}: {str(e)}')
                                yield {
                                    'domain': domain,
                                    'status': -1,
                                    'error': f'Task Error: {e.__class__.__name__}: {str(e)}',
                                    'protocol': 'unknown',
                                    'error_type': 'TASK'
                                }
                    else:
                        await asyncio.sleep(0.1) # Prevent CPU spin when no tasks

            finally:
                # Clean up
                for task in tasks:
                    task.cancel()
                queue_task.cancel()
                try:
                    await queue_task
                except asyncio.CancelledError:
                    pass
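
Below is a minimal usage sketch, not part of scanner.py: it assumes the package is importable as httpz_scanner (per the header comment) and that a newline-delimited domain list exists at domains.txt, a hypothetical file name used only for illustration.

#!/usr/bin/env python3
# Usage sketch for HTTPZScanner (illustrative, not part of the repository)

import asyncio

from httpz_scanner.scanner import HTTPZScanner

async def main():
    # 'domains.txt' is a hypothetical input file; per the scan() docstring,
    # input_source may also be '-' for stdin, a list/tuple of domains,
    # or an async generator yielding domains.
    scanner = HTTPZScanner(concurrent_limit=50, timeout=5)
    async for result in scanner.scan('domains.txt'):
        # Each result is a dict; 'domain' and 'status' are always present
        # (status is -1 for errors, with details under 'error'/'error_type').
        print(result['domain'], result['status'])

if __name__ == '__main__':
    asyncio.run(main())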