eris

- Elasticsearch Recon Ingestion Scripts (ERIS) 🔎
git clone git://git.acid.vegas/eris.git

ingest_httpx.py (2827B)

#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_httpx.py

import json

default_index = 'httpx-logs'

def construct_map() -> dict:
    '''Construct the Elasticsearch index mapping for HTTPX records.'''

    keyword_mapping = { 'type': 'text',  'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    mapping = {
        'mappings': {
            'properties': {
                'change': 'me' # Placeholder: the index properties are not defined yet (see the example record below)
            }
        }
    }

    return mapping


def process_file(file_path: str):
    '''
    Read and process HTTPX records from the log file.

    :param file_path: Path to the HTTPX log file
    '''

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            if not line:
                continue

            record = json.loads(line)

            record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
            record['domain'] = record.pop('input')

            del record['failed'], record['knowledgebase'], record['time'] # Drop fields we do not want to index

            yield record

    return None # EOF


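# Example usage (a sketch, not part of the original script): stream the records yielded by
# process_file() into Elasticsearch with the elasticsearch-py bulk helper. The client URL and
# the 'httpx.json' log file path below are placeholder assumptions.
if __name__ == '__main__':
    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es      = Elasticsearch('https://localhost:9200')
    actions = ({'_index': default_index, '_source': record} for record in process_file('httpx.json'))

    success, _ = bulk(es, actions) # Returns a tuple of (successful docs, errors)
    print(f'Indexed {success:,} records into {default_index}')
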
'''
Example record:
{
    "timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset
    "hash": { # Do we need all of these ?
        "body_md5":"4ae9394eb98233b482508cbda3b33a66",
        "body_mmh3":"-4111954",
        "body_sha256":"89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
        "body_simhash":"9814303593401624250",
        "header_md5":"980366deb2b2fb5df2ad861fc63e79ce",
        "header_mmh3":"-813072798",
        "header_sha256":"39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
        "header_simhash":"10962523587435277678"
    },
    "port":"443",
    "url":"https://supernets.org", # Remove this and only use the input field as "domain" maybe
    "input":"supernets.org", # rename to domain
    "title":"SuperNETs",
    "scheme":"https",
    "webserver":"nginx",
    "body_preview":"SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
    "content_type":"text/html",
    "method":"GET", # Do we need this ?
    "host":"51.89.151.158",
    "path":"/",
    "favicon":"-674048714",
    "favicon_path":"/i/favicon.png",
    "time":"592.907689ms", # Do we need this ?
    "a":[
        "6.150.220.23"
    ],
    "tech":[
        "Bootstrap:4.0.0",
        "HSTS",
        "Nginx"
    ],
    "words":436, # Do we need this ?
    "lines":79, # Do we need this ?
    "status_code":200,
    "content_length":4597,
    "failed":false, # Do we need this ?
    "knowledgebase":{ # Do we need this ?
        "PageType":"nonerror",
        "pHash":0
    }
}
'''
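
# A possible way to fill in the placeholder in construct_map(), sketched from the example record
# above and the fields kept by process_file(). The field names come from the record; the chosen
# Elasticsearch types are assumptions and not something the project has finalized.
def construct_map_example() -> dict:
    '''Hypothetical index mapping for HTTPX records, for illustration only.'''

    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    return {
        'mappings': {
            'properties': {
                'seen'           : { 'type': 'date'    },
                'domain'         : keyword_mapping,
                'host'           : { 'type': 'ip'      },
                'a'              : { 'type': 'ip'      },
                'port'           : { 'type': 'integer' },
                'url'            : keyword_mapping,
                'title'          : keyword_mapping,
                'scheme'         : { 'type': 'keyword' },
                'webserver'      : keyword_mapping,
                'tech'           : keyword_mapping,
                'body_preview'   : { 'type': 'text'    },
                'content_type'   : { 'type': 'keyword' },
                'method'         : { 'type': 'keyword' },
                'path'           : keyword_mapping,
                'favicon'        : { 'type': 'keyword' },
                'favicon_path'   : keyword_mapping,
                'status_code'    : { 'type': 'integer' },
                'content_length' : { 'type': 'integer' },
                'words'          : { 'type': 'integer' },
                'lines'          : { 'type': 'integer' },
                'hash'           : { 'properties': { field: { 'type': 'keyword' } for field in (
                                         'body_md5', 'body_mmh3', 'body_sha256', 'body_simhash',
                                         'header_md5', 'header_mmh3', 'header_sha256', 'header_simhash') } }
            }
        }
    }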