diff --git a/ingestors/ingest_httpx.py b/ingestors/ingest_httpx.py
@@ -64,7 +64,7 @@ async def process_data(file_path: str):
:param file_path: Path to the HTTPX log file
'''
- async with aiofiles.open(file_path, mode='r') as input_file:
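+ # Note: mode='r' is already the default for aiofiles.open, mirroring the builtin open()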
+ async with aiofiles.open(file_path) as input_file:
async for line in input_file:
line = line.strip()
diff --git a/ingestors/ingest_masscan.py b/ingestors/ingest_masscan.py
@@ -59,7 +59,7 @@ async def process_data(file_path: str):
:param file_path: Path to the Masscan log file
'''
- async with aiofiles.open(file_path, mode='r') as input_file:
+ async with aiofiles.open(file_path) as input_file:
async for line in input_file:
line = line.strip()
diff --git a/ingestors/ingest_massdns.py b/ingestors/ingest_massdns.py
@@ -39,7 +39,7 @@ async def process_data(file_path: str):
:param file_path: Path to the Massdns log file
'''
- async with aiofiles.open(file_path, mode='r') as input_file:
+ async with aiofiles.open(file_path) as input_file:
last = None
@@ -152,7 +152,7 @@ Output:
Input:
{
"_id" : "47.229.6.0"
- "_index" : "ptr-records",
+ "_index" : "eris-massdns",
"_source" : {
"ip" : "47.229.6.0",
"record" : "047-229-006-000.res.spectrum.com", # This will be a list if there are more than one PTR record
@@ -161,6 +161,6 @@ Input:
}
Notes:
-- Why do some IP addresses return a CNAME from a PTR request
-- What is dns-servfail.net (Frequent CNAME response from PTR requests)
+ Why do some IP addresses return a CNAME from a PTR request?
+ What is dns-servfail.net (frequent CNAME response from PTR requests)?
'''
\ No newline at end of file
diff --git a/ingestors/ingest_zone.py b/ingestors/ingest_zone.py
@@ -2,6 +2,7 @@
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_zone.py
+import logging
import time
try:
@@ -56,15 +57,15 @@ async def process_data(file_path: str):
:param file_path: Path to the zone file
'''
- domain_records = {}
- last_domain = None
+ async with aiofiles.open(file_path) as input_file:
+
+ last = None
- async with aiofiles.open(file_path, mode='r') as input_file:
async for line in input_file:
line = line.strip()
if line == '~eof': # Sentinel value to indicate the end of a process (Used with --watch with FIFO)
- break
+ if last: # Flush the buffered record before stopping (an async generator cannot return a value)
+ yield last
+ return
if not line or line.startswith(';'):
continue
@@ -72,22 +73,25 @@ async def process_data(file_path: str):
parts = line.split()
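+ # Malformed entries are logged and skipped so a single bad line cannot abort the whole ingest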
if len(parts) < 5:
- raise ValueError(f'Invalid line: {line}')
+ logging.warning(f'Invalid line: {line}')
+ continue
domain, ttl, record_class, record_type, data = parts[0].rstrip('.').lower(), parts[1], parts[2].lower(), parts[3].lower(), ' '.join(parts[4:])
if not ttl.isdigit():
- raise ValueError(f'Invalid TTL: {ttl} with line: {line}')
+ logging.warning(f'Invalid TTL: {ttl} with line: {line}')
+ continue
ttl = int(ttl)
# Anomaly...Doubtful any CHAOS/HESIOD records will be found in zone files
if record_class != 'in':
- raise ValueError(f'Unsupported record class: {record_class} with line: {line}')
+ logging.warning(f'Unsupported record class: {record_class} with line: {line}')
+ continue
# We do not want to collide with our current mapping (Again, this is an anomaly)
if record_type not in record_types:
- raise ValueError(f'Unsupported record type: {record_type} with line: {line}')
+ logging.warning(f'Unsupported record type: {record_type} with line: {line}')
+ continue
# Little tidying up for specific record types (removing trailing dots, etc)
if record_type == 'nsec':
@@ -97,26 +101,27 @@ async def process_data(file_path: str):
elif data.endswith('.'):
data = data.rstrip('.')
- if domain != last_domain:
- if last_domain:
- struct = {
- 'domain' : last_domain,
- 'records' : domain_records[last_domain],
- 'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
- }
-
- del domain_records[last_domain]
-
- yield {'_id': domain, '_index': default_index, '_source': struct} # Set the ID to the domain name to allow the record to be reindexed if it exists.
-
- last_domain = domain
-
- domain_records[domain] = {}
-
- if record_type not in domain_records[domain]:
- domain_records[domain][record_type] = []
-
- domain_records[domain][record_type].append({'ttl': ttl, 'data': data})
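+ # Buffer one document at a time in 'last': records are appended while consecutive lines share
+ # a domain, and the finished document is only yielded once a new domain starts (this assumes
+ # the zone file groups each domain's records together, as sorted zone files do)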
+ if last:
+ if domain == last['_doc']['domain']:
+ if record_type in last['_doc']['records']:
+ last['_doc']['records'][record_type].append({'ttl': ttl, 'data': data}) # Do we need to check for duplicate records?
+ else:
+ last['_doc']['records'][record_type] = [{'ttl': ttl, 'data': data}]
+ continue
+ else:
+ yield last
+
+ last = {
+ '_op_type' : 'update',
+ '_id' : domain,
+ '_index' : default_index,
+ '_doc' : {
+ 'domain' : domain,
+ 'records' : {record_type: [{'ttl': ttl, 'data': data}]},
+ 'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
+ },
+ 'doc_as_upsert' : True # This will create the document if it does not exist
+ }
+
+ if last: # Flush the final buffered record once the file is exhausted
+ yield last
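+
+ # Rough consumption sketch (an assumption, not code from this repo): the yielded dicts are
+ # standard bulk update actions, so an async bulk helper such as elasticsearch-py's
+ # elasticsearch.helpers.async_bulk should be able to drive them directly, e.g.
+ # await async_bulk(es, process_data('zone_file.txt')), where 'es' is a hypothetical AsyncElasticsearch client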
async def test(input_path: str):
@@ -144,22 +149,22 @@ if __name__ == '__main__':
'''
Output:
- 1001.vegas. 3600 in ns ns11.waterrockdigital.com.
- 1001.vegas. 3600 in ns ns12.waterrockdigital.com.
+ 1001.vegas. 3600 in ns ns11.waterrockdigital.com.
+ 1001.vegas. 3600 in ns ns12.waterrockdigital.com.
Input:
{
- "_id" : "1001.vegas"
- "_index" : "dns-zones",
- "_source" : {
- "domain" : "1001.vegas",
- "records" : {
- "ns": [
- {"ttl": 3600, "data": "ns11.waterrockdigital.com"},
- {"ttl": 3600, "data": "ns12.waterrockdigital.com"}
+ '_id' : '1001.vegas',
+ '_index' : 'dns-zones',
+ '_source' : {
+ 'domain' : '1001.vegas',
+ 'records' : {
+ 'ns': [
+ {'ttl': 3600, 'data': 'ns11.waterrockdigital.com'},
+ {'ttl': 3600, 'data': 'ns12.waterrockdigital.com'}
]
},
- "seen" : "2021-09-01T00:00:00Z"
+ 'seen' : '2021-09-01T00:00:00Z'
}
}