eris

- Elasticsearch Recon Ingestion Scripts (ERIS) 🔎
git clone git://git.acid.vegas/-c.git
Log | Files | Refs | Archive | README | LICENSE

ingest_httpx.py (3042B)

      1 #!/usr/bin/env python
      2 # Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
      3 # ingest_httpx.py
      4 
      5 import json
      6 
      7 try:
      8     import aiofiles
      9 except ImportError:
     10     raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')
     11 
     12 default_index = 'httpx-logs'
     13 
     14 def construct_map() -> dict:
     15     '''Construct the Elasticsearch index mapping for Masscan records.'''
     16 
     17     keyword_mapping = { 'type': 'text',  'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
     18 
     19     mapping = {
     20         'mappings': {
     21             'properties': {
     22                 'change': 'me'
     23             }
     24         }
     25     }
     26 
     27     return mapping
     28 
     29 
     30 async def process_data(file_path: str):
     31     '''
     32     Read and process HTTPX records from the log file.
     33 
     34     :param file_path: Path to the HTTPX log file
     35     '''
     36 
     37     async with aiofiles.open(file_path, mode='r') as input_file:
     38         async for line in input_file:
     39             line = line.strip()
     40 
     41             if not line:
     42                 continue
     43 
     44             record = json.loads(line)
     45 
     46             record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
     47             record['domain'] = record.pop('input')
     48 
     49             del record['failed'], record['knowledgebase'], record['time']
     50 
     51             yield {'_index': default_index, '_source': record}
     52 
     53     return None # EOF
     54 
     55 
     56   
'''
     58 Example record:
     59 {
     60     "timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset
     61     "hash": { # Do we need all of these ?
     62         "body_md5":"4ae9394eb98233b482508cbda3b33a66",
     63         "body_mmh3":"-4111954",
     64         "body_sha256":"89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
     65         "body_simhash":"9814303593401624250",
     66         "header_md5":"980366deb2b2fb5df2ad861fc63e79ce",
     67         "header_mmh3":"-813072798",
     68         "header_sha256":"39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
     69         "header_simhash":"10962523587435277678"
     70     },
     71     "port":"443",
     72     "url":"https://supernets.org", # Remove this and only use the input field as "domain" maybe
     73     "input":"supernets.org", # rename to domain
     74     "title":"SuperNETs",
     75     "scheme":"https",
     76     "webserver":"nginx",
     77     "body_preview":"SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
     78     "content_type":"text/html",
     79     "method":"GET", # Do we need this ?
     80     "host":"51.89.151.158",
     81     "path":"/",
     82     "favicon":"-674048714",
     83     "favicon_path":"/i/favicon.png",
     84     "time":"592.907689ms", # Do we need this ?
     85     "a":[
     86         "6.150.220.23"
     87     ],
     88     "tech":[
     89         "Bootstrap:4.0.0",
     90         "HSTS",
     91         "Nginx"
     92     ],
     93     "words":436, # Do we need this ?
     94     "lines":79, # Do we need this ?
     95     "status_code":200, 
     96     "content_length":4597,
     97     "failed":false, # Do we need this ?
     98     "knowledgebase":{ # Do we need this ?
     99         "PageType":"nonerror",
    100         "pHash":0
    101     }
    102 }
    103 '''