httpz

- Hyper-fast HTTP Scraping Tool
git clone git://git.acid.vegas/httpz.git
Log | Files | Refs | Archive | README | LICENSE

commit e27e5e40957b91dc84e23eb7e46f4c6e0f163c38
parent a6fc596547e73c11445875a613a3f6b598f4da4a
Author: acidvegas <acid.vegas@acid.vegas>
Date: Tue, 11 Feb 2025 19:18:52 -0500

Allow any form of input for scanning

Diffstat:
MREADME.md | 152++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mhttpz_scanner/cli.py | 26++++++--------------------
Mhttpz_scanner/parsers.py | 41+++++++++++++++++++++++++++++++++++++++--
Mhttpz_scanner/scanner.py | 8+++++---
Mhttpz_scanner/utils.py | 54+++++++++++++++++++++++++++++++++++++++++++++---------

5 files changed, 187 insertions(+), 94 deletions(-)

diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ A high-performance concurrent web scanner written in Python. HTTPZ efficiently s
 
 ## Installation
 
-### Via pip (recommended)
+### Via pip *(recommended)*
 ```bash
 # Install from PyPI
 pip install httpz_scanner
@@ -68,104 +68,136 @@ Full scan with all options:
 python -m httpz_scanner domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p -ax -r resolvers.txt
 ```
 
+### Distributed Scanning
+Split scanning across multiple machines using the `--shard` argument:
+
+```bash
+# Machine 1
+httpz domains.txt --shard 1/3
+
+# Machine 2
+httpz domains.txt --shard 2/3
+
+# Machine 3
+httpz domains.txt --shard 3/3
+```
+
+Each machine will process a different subset of domains without overlap. For example, with 3 shards (input lines counted from 0):
+- Machine 1 processes lines 0,3,6,9,...
+- Machine 2 processes lines 1,4,7,10,...
+- Machine 3 processes lines 2,5,8,11,...
+
+This allows efficient distribution of large scans across multiple machines.
+
 ### Python Library
 ```python
 import asyncio
+import aiohttp
+import aioboto3
 from httpz_scanner import HTTPZScanner
 
 async def scan_domains():
     # Initialize scanner with all possible options (showing defaults)
     scanner = HTTPZScanner(
         # Core settings
-        concurrent_limit=100,    # Number of concurrent requests
+        concurrent_limit=100,   # Number of concurrent requests
         timeout=5,              # Request timeout in seconds
-        follow_redirects=False,  # Follow redirects (max 10)
+        follow_redirects=False, # Follow redirects (max 10)
         check_axfr=False,       # Try AXFR transfer against nameservers
         resolver_file=None,     # Path to custom DNS resolvers file
         output_file=None,       # Path to JSONL output file
         show_progress=False,    # Show progress counter
         debug_mode=False,       # Show error states and debug info
         jsonl_output=False,     # Output in JSONL format
+        shard=None,             # Tuple of (shard_index, total_shards) for distributed scanning
         
         # Control which fields to show (all False by default unless show_fields is None)
         show_fields={
-            'status_code': True,       # Show status code
-            'content_type': True,      # Show content type
-            'content_length': True,    # Show content length
+            'status_code': True,      # Show status code
+            'content_type': True,     # Show content type
+            'content_length': True,   # Show content length
             'title': True,            # Show page title
             'body': True,             # Show body preview
             'ip': True,               # Show IP addresses
             'favicon': True,          # Show favicon hash
             'headers': True,          # Show response headers
-            'follow_redirects': True,  # Show redirect chain
+            'follow_redirects': True, # Show redirect chain
             'cname': True,            # Show CNAME records
             'tls': True               # Show TLS certificate info
         },
         
         # Filter results
-        match_codes={200, 301, 302},    # Only show these status codes
-        exclude_codes={404, 500, 503}   # Exclude these status codes
+        match_codes={200,301,302},  # Only show these status codes
+        exclude_codes={404,500,503} # Exclude these status codes
     )
 
     # Initialize resolvers (required before scanning)
     await scanner.init()
 
-    # Scan domains from file
-    await scanner.scan('domains.txt')
-    
-    # Or scan from stdin
-    await scanner.scan('-')
+    # Example 1: Stream from S3/MinIO using aioboto3
+    async with aioboto3.Session().client('s3', 
+            endpoint_url='http://minio.example.com:9000',
+            aws_access_key_id='access_key',
+            aws_secret_access_key='secret_key') as s3:
+        
+        response = await s3.get_object(Bucket='my-bucket', Key='huge-domains.txt')
+        async with response['Body'] as stream:
+            async def s3_generator():
+                while True:
+                    line = await stream.readline()
+                    if not line:
+                        break
+                    yield line.decode().strip()
+            
+            await scanner.scan(s3_generator())
+
+    # Example 2: Stream from URL using aiohttp
+    async with aiohttp.ClientSession() as session:
+        # For large files - stream line by line
+        async with session.get('https://example.com/huge-domains.txt') as resp:
+            async def url_generator():
+                async for line in resp.content:
+                    yield line.decode().strip()
+            
+            await scanner.scan(url_generator())
+        
+        # For small files - read all at once
+        async with session.get('https://example.com/small-domains.txt') as resp:
+            content = await resp.text()
+            await scanner.scan(content)  # Library handles splitting into lines
+
+    # Example 3: Simple list of domains
+    domains = [
+        'example1.com',
+        'example2.com',
+        'example3.com'
+    ]
+    await scanner.scan(domains)
 
 if __name__ == '__main__':
     asyncio.run(scan_domains())
 ```
 
-The scanner will return results in this format:
-```python
-{
-    'domain': 'example.com',           # Base domain
-    'url': 'https://example.com',      # Full URL
-    'status': 200,                     # HTTP status code
-    'port': 443,                       # Port number
-    'title': 'Example Domain',         # Page title
-    'body': 'Example body text...',    # Body preview
-    'content_type': 'text/html',       # Content type
-    'content_length': '12345',         # Content length
-    'ips': ['93.184.216.34'],         # IP addresses
-    'cname': 'cdn.example.com',        # CNAME record
-    'nameservers': ['ns1.example.com'],# Nameservers
-    'favicon_hash': '123456789',       # Favicon hash
-    'headers': {                       # Response headers
-        'Server': 'nginx',
-        'Content-Type': 'text/html'
-    },
-    'redirect_chain': [               # Redirect history
-        'http://example.com',
-        'https://example.com'
-    ],
-    'tls': {                         # TLS certificate info
-        'fingerprint': 'sha256...',
-        'common_name': 'example.com',
-        'issuer': 'Let\'s Encrypt',
-        'alt_names': ['www.example.com'],
-        'not_before': '2023-01-01T00:00:00',
-        'not_after': '2024-01-01T00:00:00',
-        'version': 3,
-        'serial_number': 'abcdef1234'
-    }
-}
-```
+The scanner accepts various input types:
+- Async/sync generators that yield domains
+- String content with newlines
+- Lists/tuples of domains
+- File paths
+- stdin (using '-')
+
+All inputs support sharding for distributed scanning.
 
 ## Arguments
 
-| Argument  | Long Form        | Description                                                 |
-|-----------|------------------|-------------------------------------------------------------|
-| `file`    | -                | File containing domains *(one per line)*, use `-` for stdin |
-| `-d`      | `--debug`        | Show error states and debug information                     |
-| `-c N`    | `--concurrent N` | Number of concurrent checks *(default: 100)*                |
-| `-o FILE` | `--output FILE`  | Output file path *(JSONL format)*                           |
-| `-j`      | `--jsonl`        | Output JSON Lines format to console                         |
-| `-all`    | `--all-flags`    | Enable all output flags                                     |
+| Argument      | Long Form        | Description                                                 |
+|---------------|------------------|-------------------------------------------------------------|
+| `file`        |                  | File containing domains *(one per line)*, use `-` for stdin |
+| `-d`          | `--debug`        | Show error states and debug information                     |
+| `-c N`        | `--concurrent N` | Number of concurrent checks *(default: 100)*                |
+| `-o FILE`     | `--output FILE`  | Output file path *(JSONL format)*                           |
+| `-j`          | `--jsonl`        | Output JSON Lines format to console                         |
+| `-all`        | `--all-flags`    | Enable all output flags                                     |
+| `-sh`         | `--shard N/T`    | Process shard N of T total shards *(e.g., 1/3)*             |
 
 ### Output Field Flags
 
@@ -191,5 +223,5 @@ The scanner will return results in this format:
 | `-mc CODES` | `--match-codes CODES`   | Only show specific status codes *(comma-separated)* |
 | `-ec CODES` | `--exclude-codes CODES` | Exclude specific status codes *(comma-separated)*   |
 | `-p`        | `--progress`            | Show progress counter                               |
-| `-ax`       | `--axfr`               | Try AXFR transfer against nameservers               |
-| `-r FILE`   | `--resolvers FILE`     | File containing DNS resolvers *(one per line)*      |
-\ No newline at end of file
+| `-ax`       | `--axfr`                | Try AXFR transfer against nameservers               |
+| `-r FILE`   | `--resolvers FILE`      | File containing DNS resolvers *(one per line)*      |
+\ No newline at end of file
diff --git a/httpz_scanner/cli.py b/httpz_scanner/cli.py
@@ -11,6 +11,7 @@ import sys
 from .colors  import Colors
 from .scanner import HTTPZScanner
 from .utils   import SILENT_MODE, info
+from .parsers import parse_status_codes, parse_shard
 
 def setup_logging(level='INFO', log_to_disk=False):
     '''
@@ -49,25 +50,6 @@ def setup_logging(level='INFO', log_to_disk=False):
         handlers=handlers
     )
 
-def parse_status_codes(codes_str: str) -> set:
-    '''
-    Parse comma-separated status codes and ranges into a set of integers
-    
-    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
-    '''
-    
-    codes = set()
-    try:
-        for part in codes_str.split(','):
-            if '-' in part:
-                start, end = map(int, part.split('-'))
-                codes.update(range(start, end + 1))
-            else:
-                codes.add(int(part))
-        return codes
-    except ValueError:
-        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')
-
 async def main():
     parser = argparse.ArgumentParser(
         description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}',
@@ -103,6 +85,9 @@ async def main():
     parser.add_argument('-r', '--resolvers', help='File containing DNS resolvers (one per line)')
     parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
     
+    # Add shard argument
+    parser.add_argument('-sh','--shard', type=parse_shard, help='Shard index and total shards (e.g., 1/3)')
+    
     # If no arguments provided, print help and exit
     if len(sys.argv) == 1:
         parser.print_help()
@@ -158,7 +143,8 @@ async def main():
             jsonl_output=args.jsonl,
             show_fields=show_fields,
             match_codes=args.match_codes,
-            exclude_codes=args.exclude_codes
+            exclude_codes=args.exclude_codes,
+            shard=args.shard
         )
 
         # Run the scanner with file/stdin input
diff --git a/httpz_scanner/parsers.py b/httpz_scanner/parsers.py
@@ -20,6 +20,7 @@ except ImportError:
     raise ImportError('missing mmh3 module (pip install mmh3)')
 
 from .utils import debug, error
+import argparse
 
 
 def parse_domain_url(domain: str) -> tuple:
@@ -108,6 +109,7 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
     :param base_url: base URL of the website
     :param html: HTML content of the page
     '''
+
     try:
         soup = bs4.BeautifulSoup(html, 'html.parser')
         
@@ -137,4 +139,39 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
     except Exception as e:
         debug(f'Error getting favicon for {base_url}: {str(e)}')
     
-    return None 
-\ No newline at end of file
+    return None 
+
+def parse_status_codes(codes_str: str) -> set:
+    '''
+    Parse comma-separated status codes and ranges into a set of integers
+    
+    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
+    '''
+    
+    codes = set()
+    try:
+        for part in codes_str.split(','):
+            if '-' in part:
+                start, end = map(int, part.split('-'))
+                codes.update(range(start, end + 1))
+            else:
+                codes.add(int(part))
+        return codes
+    except ValueError:
+        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')
+
+
+def parse_shard(shard_str: str) -> tuple:
+    '''
+    Parse shard argument in format INDEX/TOTAL
+    
+    :param shard_str: Shard string in format "INDEX/TOTAL"
+    '''
+
+    try:
+        shard_index, total_shards = map(int, shard_str.split('/'))
+        if shard_index < 1 or total_shards < 1 or shard_index > total_shards:
+            raise ValueError
+        return shard_index - 1, total_shards  # Convert to 0-based index
+    except (ValueError, TypeError):
+        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL') 
+\ No newline at end of file
diff --git a/httpz_scanner/scanner.py b/httpz_scanner/scanner.py
@@ -27,7 +27,7 @@ from .utils      import debug, info, USER_AGENTS, input_generator
 class HTTPZScanner:
     '''Core scanner class for HTTP domain checking'''
     
-    def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None):
+    def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None):
         '''
         Initialize the HTTPZScanner class
         
@@ -43,6 +43,7 @@ class HTTPZScanner:
         :param show_fields: Fields to show
         :param match_codes: Status codes to match
         :param exclude_codes: Status codes to exclude
+        :param shard: Tuple of (shard_index, total_shards) for distributed scanning
         '''
 
         self.concurrent_limit = concurrent_limit
@@ -54,6 +55,7 @@ class HTTPZScanner:
         self.show_progress    = show_progress
         self.debug_mode       = debug_mode
         self.jsonl_output     = jsonl_output
+        self.shard            = shard
 
         self.show_fields = show_fields or {
             'status_code'      : True,
@@ -218,8 +220,8 @@ class HTTPZScanner:
         async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
             tasks = set()
             
-            # Process domains with concurrent limit
-            for domain in input_generator(input_source):
+            # Pass shard info to input_generator
+            for domain in input_generator(input_source, self.shard):
                 if len(tasks) >= self.concurrent_limit:
                     done, tasks = await asyncio.wait(
                         tasks, return_when=asyncio.FIRST_COMPLETED
diff --git a/httpz_scanner/utils.py b/httpz_scanner/utils.py
@@ -3,6 +3,7 @@
 # httpz_scanner/utils.py
 
 import logging
+import os
 import sys
 
 
@@ -97,19 +98,54 @@ def human_size(size_bytes: int) -> str:
     return f'{size:.1f}{units[unit_index]}'
 
 
-def input_generator(input_source: str):
+def input_generator(input_source, shard: tuple = None):
     '''
-    Generator function to yield domains from file or stdin
+    Generator function to yield domains from various input sources with optional sharding
     
-    :param input_source: file or stdin
+    :param input_source: Can be:
+        - string path to local file
+        - "-" for stdin
+        - list/tuple of domains
+        - generator/iterator yielding domains
+        - string content with newlines
+    :param shard: Tuple of (shard_index, total_shards) for distributed scanning
     '''
     
+    line_num = 0
+    
+    # Handle stdin
     if input_source == '-' or input_source is None:
         for line in sys.stdin:
-            if line.strip():
-                yield line.strip()
-    else:
+            if line := line.strip():
+                if shard is None or line_num % shard[1] == shard[0]:
+                    yield line
+            line_num += 1
+            
+    # Handle local files
+    elif isinstance(input_source, str) and os.path.exists(input_source):
         with open(input_source, 'r') as f:
             for line in f:
-                if line.strip():
-                    yield line.strip() 
-\ No newline at end of file
+                if line := line.strip():
+                    if shard is None or line_num % shard[1] == shard[0]:
+                        yield line
+                line_num += 1
+                
+    # Handle iterables (generators, lists, etc)
+    elif hasattr(input_source, '__iter__') and not isinstance(input_source, (str, bytes)):
+        for line in input_source:
+            if isinstance(line, bytes):
+                line = line.decode()
+            if line := line.strip():
+                if shard is None or line_num % shard[1] == shard[0]:
+                    yield line
+            line_num += 1
+            
+    # Handle string content with newlines
+    elif isinstance(input_source, (str, bytes)):
+        if isinstance(input_source, bytes):
+            input_source = input_source.decode()
+        for line in input_source.splitlines():
+            if line := line.strip():
+                if shard is None or line_num % shard[1] == shard[0]:
+                    yield line
+            line_num += 1
+\ No newline at end of file