eris

- Elasticsearch Recon Ingestion Scripts (ERIS) 🔎
git clone git://git.acid.vegas/eris.git
Log | Files | Refs | Archive | README | LICENSE

ingest_httpx.py (5750B)

      1 #!/usr/bin/env python
      2 # Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
      3 # ingest_httpx.py
      4 
      5 import json
      6 import logging
      7 
      8 try:
      9 	import aiofiles
     10 except ImportError:
     11 	raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')
     12 
     13 
     14 # Set a default elasticsearch index if one is not provided
     15 default_index = 'eris-httpx'
     16 
     17 
     18 def construct_map() -> dict:
     19 	'''Construct the Elasticsearch index mapping for Masscan records.'''
     20 
     21 	# Match on exact value or full text search
     22 	keyword_mapping = { 'type': 'text',  'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
     23 
     24 	# Construct the index mapping
     25 	mapping = {
     26 		'mappings': {
     27 			'properties': {
     28 				'timestamp' : { 'type' : 'date' },
     29 				'hash'      : {
     30 					'properties': {
     31 						'body_md5'       : { 'type': 'keyword' },
     32 						'body_mmh3'      : { 'type': 'keyword' },
     33 						'body_sha256'    : { 'type': 'keyword' },
     34 						'body_simhash'   : { 'type': 'keyword' },
     35 						'header_md5'     : { 'type': 'keyword' },
     36 						'header_mmh3'    : { 'type': 'keyword' },
     37 						'header_sha256'  : { 'type': 'keyword' },
     38 						'header_simhash' : { 'type': 'keyword' }
     39 					}
     40 				},
     41 				'port'               : { 'type': 'integer' },
     42 				'url'                : keyword_mapping,
     43 				'final_url'          : keyword_mapping,
     44 				'input'              : keyword_mapping,
     45 				'title'              : keyword_mapping,
     46 				'scheme'             : { 'type': 'keyword' },
     47 				'webserver'          : { 'type': 'keyword' },
     48 				'body_preview'       : keyword_mapping,
     49 				'content_type'       : { 'type': 'keyword' },
     50 				'method'             : { 'type': 'keyword' },
     51 				'host'               : { 'type': 'ip' },
     52 				'path'               : keyword_mapping,
     53 				'favicon'            : { 'type': 'keyword' },
     54 				'favicon_path'       : keyword_mapping,
     55 				'a'                  : { 'type': 'ip' },
     56 				'cname'              : keyword_mapping,
     57 				'aaaa'               : { 'type': 'ip' },
     58 				'tech'               : keyword_mapping,
     59 				'words'              : { 'type': 'integer' },
     60 				'lines'              : { 'type': 'integer' },
     61 				'status_code'        : { 'type': 'integer' },
     62 				'chain_status_codes' : { 'type': 'integer' },
     63 				'content_length'     : { 'type': 'integer' }
     64 			}
     65 		}
     66 	}
     67 
     68 	return mapping
     69 
     70 
     71 async def process_data(input_path: str):
     72 	'''
     73 	Read and process the input file
     74 
     75 	:param input_path: Path to the input file
     76 	'''
     77 
     78 	async with aiofiles.open(input_path) as input_file:
     79 		# Read the input file line by line
     80 		async for line in input_file:
     81 			line = line.strip()
     82 
     83 			# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
     84 			if line == '~eof':
     85 				break
     86 
     87 			# Skip empty lines
     88 			if not line:
     89 				continue
     90 
     91 			# Parse the JSON record
     92 			try:
     93 				record = json.loads(line)
     94 			except json.JSONDecodeError:
     95 				logging.error(f'Failed to parse JSON record: {line}')
     96 				continue
     97 
     98 		# Hacky solution to maintain ISO 8601 format without milliseconds or offsets
     99 		record['timestamp'] = record['timestamp'].split('.')[0] + 'Z'
    100 
    101 		# Remove unnecessary fields we don't care about
    102 		for item in ('failed', 'knowledgebase', 'time', 'csp'):
    103 			if item in record:
    104 				del record[item]
    105 
    106 		yield {'_index': default_index, '_source': record}
    107 
    108 
    109 async def test(input_path: str):
    110 	'''
    111 	Test the ingestion process
    112 
    113 	:param input_path: Path to the input file
    114 	'''
    115 
    116 	async for document in process_data(input_path):
    117 		print(document)
    118 
    119 
    120 
    121 if __name__ == '__main__':
    122 	import argparse
    123 	import asyncio
    124 
    125 	parser = argparse.ArgumentParser(description='Ingestor for ERIS')
    126 	parser.add_argument('input_path', help='Path to the input file or directory')
    127 	args = parser.parse_args()
    128 
    129 	asyncio.run(test(args.input_path))
    130 
    131 
    132 
    133 '''
    134 Deploy:
    135 	go install -v github.com/projectdiscovery/httpx/cmd/httpx@latest 
    136 	curl -s https://public-dns.info/nameservers.txt -o nameservers.txt
    137 	httpx -l fulldomains.txt -t 200 -sc -location -favicon -title -bp -td -ip -cname -mc 200,201,301,302,303,307,308 -fr -r nameservers.txt -retries 2 -stream -sd -j -o fifo.json -v
    138 
    139 Output:
    140 	{
    141 		"timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset
    142 		"hash": { # Do we need all of these ?
    143 			"body_md5"       : "4ae9394eb98233b482508cbda3b33a66",
    144 			"body_mmh3"      : "-4111954",
    145 			"body_sha256"    : "89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
    146 			"body_simhash"   : "9814303593401624250",
    147 			"header_md5"     : "980366deb2b2fb5df2ad861fc63e79ce",
    148 			"header_mmh3"    : "-813072798",
    149 			"header_sha256"  : "39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
    150 			"header_simhash" : "10962523587435277678"
    151 		},
    152 		"port"           : "443",
    153 		"url"            : "https://supernets.org", # Remove this and only use the input field as "domain" maybe
    154 		"input"          : "supernets.org", # rename to domain
    155 		"title"          : "SuperNETs",
    156 		"scheme"         : "https",
    157 		"webserver"      : "nginx",
    158 		"body_preview"   : "SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
    159 		"content_type"   : "text/html",
    160 		"method"         : "GET", # Remove this
    161 		"host"           : "51.89.151.158",
    162 		"path"           : "/",
    163 		"favicon"        : "-674048714",
    164 		"favicon_path"   : "/i/favicon.png",
    165 		"time"           : "592.907689ms", # Do we need this ?
    166 		"a"              : ["6.150.220.23"],
    167 		"tech"           : ["Bootstrap:4.0.0", "HSTS", "Nginx"],
    168 		"words"          : 436, # Do we need this ?
    169 		"lines"          : 79, # Do we need this ?
    170 		"status_code"    : 200,
    171 		"content_length" : 4597,
    172 		"failed"         : false, # Do we need this ?
    173 		"knowledgebase"  : { # Do we need this ?
    174 			"PageType" : "nonerror",
    175 			"pHash"    : 0
    176 		}
    177 	}
    178 '''