eris - Elasticsearch Recon Ingestion Scripts (ERIS) 🔎 |
git clone git://git.acid.vegas/eris.git |
Log | Files | Refs | Archive | README | LICENSE |
ingest_httpx.py (5750B)
#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_httpx.py

import json
import logging

try:
    import aiofiles
except ImportError:
    raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')


# Set a default elasticsearch index if one is not provided
default_index = 'eris-httpx'


def construct_map() -> dict:
    '''Construct the Elasticsearch index mapping for httpx records.'''

    # Match on exact value or full text search
    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    # Construct the index mapping
    mapping = {
        'mappings': {
            'properties': {
                'timestamp' : { 'type' : 'date' },
                'hash' : {
                    'properties': {
                        'body_md5'       : { 'type': 'keyword' },
                        'body_mmh3'      : { 'type': 'keyword' },
                        'body_sha256'    : { 'type': 'keyword' },
                        'body_simhash'   : { 'type': 'keyword' },
                        'header_md5'     : { 'type': 'keyword' },
                        'header_mmh3'    : { 'type': 'keyword' },
                        'header_sha256'  : { 'type': 'keyword' },
                        'header_simhash' : { 'type': 'keyword' }
                    }
                },
                'port'               : { 'type': 'integer' },
                'url'                : keyword_mapping,
                'final_url'          : keyword_mapping,
                'input'              : keyword_mapping,
                'title'              : keyword_mapping,
                'scheme'             : { 'type': 'keyword' },
                'webserver'          : { 'type': 'keyword' },
                'body_preview'       : keyword_mapping,
                'content_type'       : { 'type': 'keyword' },
                'method'             : { 'type': 'keyword' },
                'host'               : { 'type': 'ip' },
                'path'               : keyword_mapping,
                'favicon'            : { 'type': 'keyword' },
                'favicon_path'       : keyword_mapping,
                'a'                  : { 'type': 'ip' },
                'cname'              : keyword_mapping,
                'aaaa'               : { 'type': 'ip' },
                'tech'               : keyword_mapping,
                'words'              : { 'type': 'integer' },
                'lines'              : { 'type': 'integer' },
                'status_code'        : { 'type': 'integer' },
                'chain_status_codes' : { 'type': 'integer' },
                'content_length'     : { 'type': 'integer' }
            }
        }
    }

    return mapping


async def process_data(input_path: str):
    '''
    Read and process the input file

    :param input_path: Path to the input file
    '''

    async with aiofiles.open(input_path) as input_file:
        # Read the input file line by line
        async for line in input_file:
            line = line.strip()

            # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
            if line == '~eof':
                break

            # Skip empty lines
            if not line:
                continue

            # Parse the JSON record
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                logging.error(f'Failed to parse JSON record: {line}')
                continue

            # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
            # Guarded so one record without a timestamp doesn't KeyError and kill the stream
            if 'timestamp' not in record:
                logging.error(f'Record is missing a timestamp field: {line}')
                continue
            record['timestamp'] = record['timestamp'].split('.')[0] + 'Z'

            # Remove unnecessary fields we don't care about (pop is a no-op if absent)
            for item in ('failed', 'knowledgebase', 'time', 'csp'):
                record.pop(item, None)

            yield {'_index': default_index, '_source': record}


async def test(input_path: str):
    '''
    Test the ingestion process

    :param input_path: Path to the input file
    '''

    async for document in process_data(input_path):
        print(document)



if __name__ == '__main__':
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(description='Ingestor for ERIS')
    parser.add_argument('input_path', help='Path to the input file or directory')
    args = parser.parse_args()

    asyncio.run(test(args.input_path))



'''
Deploy:
    go install -v github.com/projectdiscovery/httpx/cmd/httpx@latest
    curl -s https://public-dns.info/nameservers.txt -o nameservers.txt
    httpx -l fulldomains.txt -t 200 -sc -location -favicon -title -bp -td -ip -cname -mc 200,201,301,302,303,307,308 -fr -r nameservers.txt -retries 2 -stream -sd -j -o fifo.json -v

Output:
{
    "timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset
    "hash": { # Do we need all of these ?
        "body_md5"       : "4ae9394eb98233b482508cbda3b33a66",
        "body_mmh3"      : "-4111954",
        "body_sha256"    : "89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
        "body_simhash"   : "9814303593401624250",
        "header_md5"     : "980366deb2b2fb5df2ad861fc63e79ce",
        "header_mmh3"    : "-813072798",
        "header_sha256"  : "39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
        "header_simhash" : "10962523587435277678"
    },
    "port"           : "443",
    "url"            : "https://supernets.org", # Remove this and only use the input field as "domain" maybe
    "input"          : "supernets.org", # rename to domain
    "title"          : "SuperNETs",
    "scheme"         : "https",
    "webserver"      : "nginx",
    "body_preview"   : "SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
    "content_type"   : "text/html",
    "method"         : "GET", # Remove this
    "host"           : "51.89.151.158",
    "path"           : "/",
    "favicon"        : "-674048714",
    "favicon_path"   : "/i/favicon.png",
    "time"           : "592.907689ms", # Do we need this ?
    "a"              : ["6.150.220.23"],
    "tech"           : ["Bootstrap:4.0.0", "HSTS", "Nginx"],
    "words"          : 436, # Do we need this ?
    "lines"          : 79, # Do we need this ?
    "status_code"    : 200,
    "content_length" : 4597,
    "failed"         : false, # Do we need this ?
    "knowledgebase"  : { # Do we need this ?
        "PageType" : "nonerror",
        "pHash"    : 0
    }
}
'''