diff --git a/README.md b/README.md
@@ -24,14 +24,14 @@ python eris.py [options] <input>
| `--watch` | Create or watch a FIFO for real-time indexing |
###### Elasticsearch arguments
-| Argument | Description | Default |
-|-----------------|---------------------------------------------------------|---------------------|
-| `--host` | Elasticsearch host | `http://localhost/` |
-| `--port` | Elasticsearch port | `9200` |
-| `--user` | Elasticsearch username | `elastic` |
-| `--password` | Elasticsearch password | `$ES_PASSWORD` |
-| `--api-key` | Elasticsearch API Key for authentication | `$ES_APIKEY` |
-| `--self-signed` | Elasticsearch connection with a self-signed certificate | |
+| Argument | Description | Default |
+|-----------------|---------------------------------------------------------|--------------------|
+| `--host` | Elasticsearch host | `http://localhost` |
+| `--port` | Elasticsearch port | `9200` |
+| `--user` | Elasticsearch username | `elastic` |
+| `--password` | Elasticsearch password | `$ES_PASSWORD` |
+| `--api-key` | Elasticsearch API Key for authentication | `$ES_APIKEY` |
+| `--self-signed` | Elasticsearch connection with a self-signed certificate | |
###### Elasticsearch indexing arguments
| Argument | Description | Default |
@@ -50,20 +50,23 @@ python eris.py [options] <input>
| `--timeout` | Number of seconds to wait before retrying a chunk | `60` |
###### Ingestion arguments
-| Argument | Description |
-|-------------|--------------------------|
-| `--certs` | Index Certstream records |
-| `--httpx` | Index HTTPX records |
-| `--masscan` | Index Masscan records |
-| `--massdns` | Index massdns records |
-| `--zone` | Index zone DNS records |
+| Argument | Description |
+|---------------|--------------------------|
+| `--certstream` | Index Certstream records |
+| `--httpx` | Index HTTPX records |
+| `--masscan` | Index Masscan records |
+| `--massdns` | Index massdns records |
+| `--zone` | Index zone DNS records |
-This ingestion suite will use the built in node sniffer, so by connecting to a single node, you can load balance across the entire cluster.
-It is good to know how much nodes you have in the cluster to determine how to fine tune the arguments for the best performance, based on your environment.
+~~This ingestion suite will use the built in node sniffer, so by connecting to a single node, you can load balance across the entire cluster.~~
+
+**Note:** The sniffer has been disabled for now due to an [issue](https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) with the 8.x elasticsearch client: the auth headers are not properly sent when sniffing is enabled. A working [patch](https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) was shared and has been *mostly* converted in [helpers/sniff_patch.py](./helpers/sniff_patch.py) for the async client.
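+
+Once the async patch fully works, usage would look something like this (a sketch based on [helpers/sniff_patch.py](./helpers/sniff_patch.py); the host and credentials below are placeholders):
+
+```python
+from helpers import sniff_patch
+
+# Forwards its arguments to AsyncElasticsearch after installing the patched
+# sniff callback, which re-adds the missing Basic auth header.
+# (Run inside an async function; basic_auth is required by the patch.)
+es = await sniff_patch.init_elasticsearch(
+    hosts=['https://localhost:9200'],
+    basic_auth=('elastic', 'password'),
+    sniff_on_start=True,
+    sniff_on_node_failure=True)
+```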
## Roadmap
- Create a module for RIR database ingestion *(WHOIS, delegations, transfer, ASN mapping, peering, etc)*
- Dynamically update the batch metrics when the sniffer adds or removes nodes.
+- Fix issue with leftover FIFO files *(catch SIGTERM / SIGINT signals)*
+- Create a working patch for the async client to properly send auth headers.
___
diff --git a/eris.py b/eris.py
@@ -12,240 +12,264 @@ import sys
sys.dont_write_bytecode = True
try:
- # This is commented out because there is a bug with the elasticsearch library that requires a patch (see initialize() method below)
- #from elasticsearch import AsyncElasticsearch
- from elasticsearch.exceptions import NotFoundError
- from elasticsearch.helpers import async_streaming_bulk
+ from elasticsearch import AsyncElasticsearch
+ from elasticsearch.exceptions import NotFoundError
+ from elasticsearch.helpers import async_streaming_bulk
except ImportError:
- raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')
-
-# Setting up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d %I:%M:%S')
+ raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')
class ElasticIndexer:
- def __init__(self, args: argparse.Namespace):
- '''
- Initialize the Elastic Search indexer.
+ def __init__(self, args: argparse.Namespace):
+ '''
+ Initialize the Elastic Search indexer.
+
+ :param args: Parsed arguments from argparse
+ '''
+
+ self.chunk_max = args.chunk_max * 1024 * 1024 # Convert megabytes to bytes
+ self.chunk_size = args.chunk_size
+ self.es_index = args.index
+
+ # Sniffing disabled due to an issue with the elasticsearch 8.x client (https://github.com/elastic/elasticsearch-py/issues/2005)
+ es_config = {
+ #'hosts' : [f'{args.host}:{args.port}'],
+ 'hosts' : [f'{args.host}:{port}' for port in ('9200', '9201', '9202')], # Temporary alternative to sniffing
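+ # Assumes a three-node cluster listening on consecutive ports 9200-9202; adjust this list to match your environment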
+ 'verify_certs' : args.self_signed,
+ 'ssl_show_warn' : args.self_signed,
+ 'request_timeout' : args.timeout,
+ 'max_retries' : args.retries,
+ 'retry_on_timeout' : True
+ #'sniff_on_start': True,
+ #'sniff_on_node_failure': True,
+ #'min_delay_between_sniffing': 60
+ }
+
+ if args.api_key:
+ es_config['api_key'] = (args.api_key, '') # Verify this is correct
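+ # Note: the 8.x client also accepts the encoded API key as a single string instead of an (id, api_key) tuple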
+ else:
+ es_config['basic_auth'] = (args.user, args.password)
+
+ self.es = AsyncElasticsearch(**es_config)
+
- :param args: Parsed arguments from argparse
- '''
+ async def create_index(self, map_body: dict, pipeline: str = None, replicas: int = 1, shards: int = 1):
+ '''
+ Create the Elasticsearch index with the defined mapping.
- self.chunk_max = args.chunk_max * 1024 * 1024 # MB
- self.chunk_size = args.chunk_size
- self.es = None
- self.es_index = args.index
+ :param map_body: Mapping for the index
+ :param pipeline: Name of the ingest pipeline to use for the index
+ :param replicas: Number of replicas for the index
+ :param shards: Number of shards for the index
+ '''
- self.es_config = {
- 'hosts': [f'{args.host}:{args.port}'],
- 'verify_certs': args.self_signed,
- 'ssl_show_warn': args.self_signed,
- 'request_timeout': args.timeout,
- 'max_retries': args.retries,
- 'retry_on_timeout': True,
- 'sniff_on_start': False, # Problems when True....
- 'sniff_on_node_failure': True,
- 'min_delay_between_sniffing': 60 # Add config option for this?
- }
+ if await self.es.indices.exists(index=self.es_index):
+ logging.info(f'Index \'{self.es_index}\' already exists.')
+ return
- #if args.api_key:
- # self.es_config['api_key'] = (args.api_key, '') # Verify this is correct
- #else:
- self.es_config['basic_auth'] = (args.user, args.password)
+ mapping = map_body
+ mapping['settings'] = {
+ 'number_of_shards' : shards,
+ 'number_of_replicas' : replicas
+ }
- async def initialize(self):
- '''Initialize the Elasticsearch client.'''
+ if pipeline:
+ try:
+ await self.es.ingest.get_pipeline(id=pipeline)
+ logging.info(f'Using ingest pipeline \'{pipeline}\' for index \'{self.es_index}\'')
+ mapping['settings']['index.default_pipeline'] = pipeline
+ except NotFoundError:
+ raise ValueError(f'Ingest pipeline \'{pipeline}\' does not exist.')
- # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
- import sniff_patch
- self.es = await sniff_patch.init_elasticsearch(**self.es_config)
+ response = await self.es.indices.create(index=self.es_index, body=mapping)
- # Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
- #self.es = AsyncElasticsearch(**es_config)
+ if response.get('acknowledged') and response.get('shards_acknowledged'):
+ logging.info(f'Index \'{self.es_index}\' successfully created.')
+ else:
+ raise Exception(f'Failed to create index. ({response})')
- async def create_index(self, map_body: dict, pipeline: str = '', replicas: int = 1, shards: int = 1):
- '''
- Create the Elasticsearch index with the defined mapping.
+ async def get_cluster_health(self) -> dict:
+ '''Get the health of the Elasticsearch cluster.'''
- :param map_body: Mapping for the index
- :param pipeline: Name of the ingest pipeline to use for the index
- :param replicas: Number of replicas for the index
- :param shards: Number of shards for the index
- '''
+ return await self.es.cluster.health()
- if await self.es.indices.exists(index=self.es_index):
- logging.info(f'Index \'{self.es_index}\' already exists.')
- return
- mapping = map_body
+ async def get_cluster_size(self) -> int:
+ '''Get the number of nodes in the Elasticsearch cluster.'''
- mapping['settings'] = {
- 'number_of_shards': shards,
- 'number_of_replicas': replicas
- }
+ cluster_stats = await self.es.cluster.stats()
+ number_of_nodes = cluster_stats['nodes']['count']['total']
- if pipeline:
- try:
- await self.es.ingest.get_pipeline(id=pipeline)
- logging.info(f'Using ingest pipeline \'{pipeline}\' for index \'{self.es_index}\'')
- mapping['settings']['index.default_pipeline'] = pipeline
- except NotFoundError:
- raise ValueError(f'Ingest pipeline \'{pipeline}\' does not exist.')
+ return number_of_nodes
- response = await self.es.indices.create(index=self.es_index, body=mapping)
- if response.get('acknowledged') and response.get('shards_acknowledged'):
- logging.info(f'Index \'{self.es_index}\' successfully created.')
- else:
- raise Exception(f'Failed to create index. ({response})')
+ async def process_data(self, file_path: str, data_generator: callable):
+ '''
+ Index records in chunks to Elasticsearch.
+ :param file_path: Path to the file
+ :param data_generator: Generator for the records to index
+ '''
- async def get_cluster_health(self) -> dict:
- '''Get the health of the Elasticsearch cluster.'''
+ count = 0
+ total = 0
- return await self.es.cluster.health()
+ try:
+ async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max):
+ action, result = result.popitem()
+ if not ok:
+ logging.error(f'Failed to index document ({result["_id"]}) to {self.es_index} from {file_path} ({result})')
+ continue
- async def get_cluster_size(self) -> int:
- '''Get the number of nodes in the Elasticsearch cluster.'''
+ count += 1
+ total += 1
- cluster_stats = await self.es.cluster.stats()
- number_of_nodes = cluster_stats['nodes']['count']['total']
+ if count == self.chunk_size:
+ logging.info(f'Successfully indexed {self.chunk_size:,} ({total:,} processed) records to {self.es_index} from {file_path}')
+ count = 0
- return number_of_nodes
+ logging.info(f'Finished indexing {total:,} records to {self.es_index} from {file_path}')
+ except Exception as e:
+ raise Exception(f'Failed to index records to {self.es_index} from {file_path} ({e})')
- async def process_data(self, file_path: str, data_generator: callable):
- '''
- Index records in chunks to Elasticsearch.
- :param file_path: Path to the file
- :param index_name: Name of the index
- :param data_generator: Generator for the records to index
- '''
+def setup_logger(name: str, level: int = logging.INFO, to_file: bool = False, max_bytes: int = 250000, backups: int = 7) -> logging.Logger:
+ '''
+ Setup a custom logger with options for console and file logging.
- count = 0
- total = 0
+ :param name: Name of the logger.
+ :param level: Logging level.
+ :param to_file: Whether to log to a file.
+ :param max_bytes: Maximum size in bytes before rotating log file.
+ :param backups: Number of backup files to keep.
+ '''
- async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max):
- action, result = result.popitem()
+ logger = logging.getLogger(name)
+ logger.setLevel(level)
- if not ok:
- logging.error(f'Failed to index document ({result["_id"]}) to {self.es_index} from {file_path} ({result})')
- input('Press Enter to continue...') # Debugging (will possibly remove this since we have retries enabled)
- continue
+ formatter_console = logging.Formatter('%(asctime)s | %(levelname)9s | %(message)s', '%I:%M:%S')
+ formatter_file = logging.Formatter('%(asctime)s | %(levelname)9s | %(filename)s.%(funcName)s | %(message)s', '%Y-%m-%d %H:%M:%S')
- count += 1
- total += 1
+ sh = logging.StreamHandler()
+ sh.setFormatter(formatter_console)
+ logger.addHandler(sh)
- if count == self.chunk_size:
- logging.info(f'Successfully indexed {self.chunk_size:,} ({total:,} processed) records to {self.es_index} from {file_path}')
- count = 0
+ if to_file:
+ os.makedirs('logs', exist_ok=True)
+ fh = logging.handlers.RotatingFileHandler('logs/debug.log', maxBytes=max_bytes, backupCount=backups, encoding='utf-8')
+ fh.setFormatter(formatter_file)
+ logger.addHandler(fh)
- logging.info(f'Finished indexing {total:,} records to {self.es_index} from {file_path}')
+ return logger
async def main():
- '''Main function when running this script directly.'''
-
- parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
-
- # General arguments
- parser.add_argument('input_path', help='Path to the input file or directory') # Required
- parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
-
- # Elasticsearch arguments
- parser.add_argument('--host', default='http://localhost/', help='Elasticsearch host')
- parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
- parser.add_argument('--user', default='elastic', help='Elasticsearch username')
- parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, check environment variable ES_PASSWORD)')
- #parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
- parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')
-
- # Elasticsearch indexing arguments
- parser.add_argument('--index', help='Elasticsearch index name')
- parser.add_argument('--pipeline', help='Use an ingest pipeline for the index')
- parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
- parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
-
- # Performance arguments
- parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk')
- parser.add_argument('--chunk-max', type=int, default=100, help='Maximum size of a chunk in bytes')
- parser.add_argument('--retries', type=int, default=100, help='Number of times to retry indexing a chunk before failing')
- parser.add_argument('--timeout', type=int, default=60, help='Number of seconds to wait before retrying a chunk')
-
- # Ingestion arguments
- parser.add_argument('--cert', action='store_true', help='Index Certstream records')
- parser.add_argument('--httpx', action='store_true', help='Index Httpx records')
- parser.add_argument('--masscan', action='store_true', help='Index Masscan records')
- parser.add_argument('--massdns', action='store_true', help='Index Massdns records')
- parser.add_argument('--zone', action='store_true', help='Index Zone records')
-
- args = parser.parse_args()
-
- if args.watch:
- if not os.path.exists(args.input_path):
- os.mkfifo(args.input_path)
- elif os.path.exists(args.input_path) and stat.S_ISFIFO(os.stat(args.input_path).st_mode):
- raise ValueError(f'Path {args.input_path} is not a FIFO')
- elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
- raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
-
- edx = ElasticIndexer(args)
- await edx.initialize() # Initialize the Elasticsearch client asyncronously
-
- if args.cert:
- from ingestors import ingest_certs as ingestor
- if args.httpx:
- from ingestors import ingest_httpx as ingestor
- elif args.masscan:
- from ingestors import ingest_masscan as ingestor
- elif args.massdns:
- from ingestors import ingest_massdns as ingestor
- elif args.zone:
- from ingestors import ingest_zone as ingestor
-
- if not isinstance(ingestor, object):
- raise ValueError('No ingestor selected')
-
- health = await edx.get_cluster_health()
- print(health)
-
- await asyncio.sleep(5) # Delay to allow time for sniffing to complete
-
- nodes = await edx.get_cluster_size()
- logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
-
- if not edx.es_index:
- edx.es_index = ingestor.default_index
-
- map_body = ingestor.construct_map()
- await edx.create_index(map_body, args.pipeline, args.replicas, args.shards)
-
- if os.path.isfile(args.input_path):
- logging.info(f'Processing file: {args.input_path}')
- await edx.process_data(args.input_path, ingestor.process_data)
-
- elif stat.S_ISFIFO(os.stat(args.input_path).st_mode):
- logging.info(f'Watching FIFO: {args.input_path}')
- await edx.process_data(args.input_path, ingestor.process_data)
-
- elif os.path.isdir(args.input_path):
- count = 1
- total = len(os.listdir(args.input_path))
- logging.info(f'Processing {total:,} files in directory: {args.input_path}')
- for file in sorted(os.listdir(args.input_path)):
- file_path = os.path.join(args.input_path, file)
- if os.path.isfile(file_path):
- logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
- await edx.process_data(file_path, ingestor.process_data)
- count += 1
- else:
- logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
+ '''Main function when running this script directly.'''
+
+ parser = argparse.ArgumentParser(description='Elasticsearch Recon Ingestion Scripts (ERIS)')
+
+ # General arguments
+ parser.add_argument('input_path', help='Path to the input file or directory') # Required
+ parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
+
+ # Elasticsearch arguments
+ parser.add_argument('--host', default='http://localhost', help='Elasticsearch host')
+ parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
+ parser.add_argument('--user', default='elastic', help='Elasticsearch username')
+ parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, check environment variable ES_PASSWORD)')
+ parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
+ parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')
+
+ # Elasticsearch indexing arguments
+ parser.add_argument('--index', help='Elasticsearch index name')
+ parser.add_argument('--pipeline', help='Use an ingest pipeline for the index')
+ parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
+ parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
+
+ # Performance arguments
+ parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk')
+ parser.add_argument('--chunk-max', type=int, default=100, help='Maximum size of a chunk in megabytes')
+ parser.add_argument('--retries', type=int, default=30, help='Number of times to retry indexing a chunk before failing')
+ parser.add_argument('--timeout', type=int, default=60, help='Number of seconds to wait before retrying a chunk')
+
+ # Ingestion arguments
+ parser.add_argument('--certstream', action='store_true', help='Index Certstream records')
+ parser.add_argument('--httpx', action='store_true', help='Index Httpx records')
+ parser.add_argument('--masscan', action='store_true', help='Index Masscan records')
+ parser.add_argument('--massdns', action='store_true', help='Index Massdns records')
+ parser.add_argument('--zone', action='store_true', help='Index Zone records')
+
+ args = parser.parse_args()
+
+ if args.host.endswith('/'):
+ args.host = args.host[:-1]
+
+ if args.watch:
+ if not os.path.exists(args.input_path):
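+ # os.mkfifo() is POSIX-only, so --watch is unavailable on Windows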
+ os.mkfifo(args.input_path)
+ elif not stat.S_ISFIFO(os.stat(args.input_path).st_mode):
+ raise ValueError(f'Path {args.input_path} is not a FIFO')
+ elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
+ raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
+
+ edx = ElasticIndexer(args)
+
+ if args.certstream:
+ from ingestors import ingest_certstream as ingestor
+ elif args.httpx:
+ from ingestors import ingest_httpx as ingestor
+ elif args.masscan:
+ from ingestors import ingest_masscan as ingestor
+ elif args.massdns:
+ from ingestors import ingest_massdns as ingestor
+ elif args.zone:
+ from ingestors import ingest_zone as ingestor
+ else:
+ raise ValueError('No ingestor specified')
+
+ health = await edx.get_cluster_health()
+ logging.info(health)
+
+ await asyncio.sleep(5) # Brief delay before checking the cluster size (originally to allow sniffing to complete; sniffing is currently disabled)
+
+ nodes = await edx.get_cluster_size()
+ logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
+
+ if not edx.es_index:
+ edx.es_index = ingestor.default_index
+
+ map_body = ingestor.construct_map()
+ await edx.create_index(map_body, args.pipeline, args.replicas, args.shards)
+
+ if os.path.isfile(args.input_path):
+ logging.info(f'Processing file: {args.input_path}')
+ await edx.process_data(args.input_path, ingestor.process_data)
+
+ elif stat.S_ISFIFO(os.stat(args.input_path).st_mode):
+ logging.info(f'Watching FIFO: {args.input_path}')
+ await edx.process_data(args.input_path, ingestor.process_data)
+
+ elif os.path.isdir(args.input_path):
+ count = 1
+ total = len(os.listdir(args.input_path))
+ logging.info(f'Processing {total:,} files in directory: {args.input_path}')
+ for file in sorted(os.listdir(args.input_path)):
+ file_path = os.path.join(args.input_path, file)
+ if os.path.isfile(file_path):
+ logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
+ await edx.process_data(file_path, ingestor.process_data)
+ count += 1
+ else:
+ logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
if __name__ == '__main__':
- asyncio.run(main())
+ setup_logger('', level=logging.INFO, to_file=True) # Configure the root logger, since this script logs via the module-level logging functions
+ print('')
+ print('┏┓┳┓┳┏┓ Elasticsearch Recon Ingestion Scripts')
+ print('┣ ┣┫┃┗┓ Developed by Acidvegas in Python')
+ print('┗┛┛┗┻┗┛ https://git.acid.vegas/eris')
+ print('')
+ asyncio.run(main())
diff --git a/helpers/sniff_patch.py b/helpers/sniff_patch.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
+# sniff_patch.py [asynchronous development]
+
+# Note:
+# This is a patch for the elasticsearch 8.x client to fix the sniff_* options.
+# This patch is only needed if you use the sniff_* options and only works with basic auth.
+# Call init_elasticsearch() with normal Elasticsearch params.
+#
+# Source:
+# - https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960
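+#
+# Example usage (inside an async function; the host and credentials are placeholders):
+#   es = await init_elasticsearch(hosts=['https://localhost:9200'], basic_auth=('elastic', 'password'), sniff_on_start=True)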
+
+import base64
+
+import elasticsearch._async.client as async_client
+from elasticsearch.exceptions import SerializationError, ConnectionError
+
+
+async def init_elasticsearch(*args, **kwargs):
+ '''
+ Initialize the Async Elasticsearch client with the sniff patch.
+
+ :param args: Async Elasticsearch positional arguments.
+ :param kwargs: Async Elasticsearch keyword arguments.
+ '''
+ async_client.default_sniff_callback = await _override_sniff_callback(kwargs['basic_auth'])
+
+ return async_client.AsyncElasticsearch(*args, **kwargs)
+
+
+async def _override_sniff_callback(basic_auth):
+ '''
+ Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166
+ Completely unmodified except for adding the auth header to the elastic request.
+ Allows us to continue using the sniff_* options while this is broken in the library.
+
+ TODO: Remove this when this issue is patched:
+ - https://github.com/elastic/elasticsearch-py/issues/2005
+ '''
+ auth_str = base64.b64encode(':'.join(basic_auth).encode()).decode()
+ sniffed_node_callback = async_client._base._default_sniffed_node_callback
+
+ async def modified_sniff_callback(transport, sniff_options):
+ for _ in transport.node_pool.all():
+ try:
+ meta, node_infos = await transport.perform_request(
+ 'GET',
+ '/_nodes/_all/http',
+ headers={
+ 'accept': 'application/vnd.elasticsearch+json; compatible-with=8',
+ 'authorization': f'Basic {auth_str}' # This auth header is missing in 8.x releases of the client, and causes 401s
+ },
+ request_timeout=(
+ sniff_options.sniff_timeout
+ if not sniff_options.is_initial_sniff
+ else None
+ ),
+ )
+ except (SerializationError, ConnectionError):
+ continue
+
+ if not 200 <= meta.status <= 299:
+ continue
+
+ node_configs = []
+ for node_info in node_infos.get('nodes', {}).values():
+ address = node_info.get('http', {}).get('publish_address')
+ if not address or ':' not in address:
+ continue
+
+ if '/' in address:
+ # Support 7.x host/ip:port behavior where http.publish_host has been set.
+ fqdn, ipaddress = address.split('/', 1)
+ host = fqdn
+ _, port_str = ipaddress.rsplit(':', 1)
+ port = int(port_str)
+ else:
+ host, port_str = address.rsplit(':', 1)
+ port = int(port_str)
+
+ assert sniffed_node_callback is not None
+
+ # Pay no mind to this; it's just a workaround for my own setup.
+ #host = elastic.domain.com
+ #port = int(str(port).replace('', ''))
+
+ sniffed_node = sniffed_node_callback(node_info, meta.node.replace(host=host, port=port))
+
+ if sniffed_node is None:
+ continue
+
+ # Use the node which was able to make the request as a base.
+ node_configs.append(sniffed_node)
+
+ if node_configs:
+ return node_configs
+
+ return []
+
+ return modified_sniff_callback
diff --git a/ingestors/ingest_certs.py b/ingestors/ingest_certs.py
@@ -1,155 +0,0 @@
-#!/usr/bin/env python
-# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
-# ingest_certs.py
-
-import asyncio
-import json
-import logging
-import time
-
-try:
- import websockets
-except ImportError:
- raise ImportError('Missing required \'websockets\' library. (pip install websockets)')
-
-
-# Set a default elasticsearch index if one is not provided
-default_index = 'cert-stream'
-
-
-def construct_map() -> dict:
- '''Construct the Elasticsearch index mapping for Certstream records.'''
-
- # Match on exact value or full text search
- keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
-
- # Construct the index mapping
- mapping = {
- 'mappings': {
- 'properties' : {
- 'domain' : keyword_mapping,
- 'seen' : { 'type': 'date' }
- }
- }
- }
-
- return mapping
-
-
-async def process_data(place_holder: str = None):
- '''
- Read and process Certsream records live from the Websocket stream.
-
- :param place_holder: Placeholder parameter to match the process_data function signature of other ingestors.
- '''
-
- while True:
- try:
- async with websockets.connect('wss://certstream.calidog.io') as websocket:
- while True:
- # Read a line from the websocket
- line = await websocket.recv()
-
- # Parse the JSON record
- try:
- record = json.loads(line)
- except json.decoder.JSONDecodeError:
- logging.error(f'Invalid line from the websocket: {line}')
- continue
-
- # Grab the unique domains from the record (excluding wildcards)
- domains = record['data']['leaf_cert']['all_domains']
- domains = set([domain[2:] if domain.startswith('*.') else domain for domain in domains])
-
- # Construct the document
- for domain in domains:
- struct = {
- 'domain' : domain,
- 'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
- }
-
- yield {'_id': id, '_index': default_index, '_source': struct}
-
- except websockets.ConnectionClosed:
- logging.error('Connection to Certstream was closed. Attempting to reconnect...')
- await asyncio.sleep(15)
-
- except Exception as e:
- logging.error(f'An error occurred while processing Certstream records! ({e})')
- break
-
-
-async def test():
- '''Test the ingestion process.'''
-
- async for document in process_data():
- print(document)
-
-
-
-if __name__ == '__main__':
- import asyncio
-
- asyncio.run(test())
-
-
-
-'''
-Output:
- {
- "data": {
- "cert_index": 43061646,
- "cert_link": "https://yeti2025.ct.digicert.com/log/ct/v1/get-entries?start=43061646&end=43061646",
- "leaf_cert": {
- "all_domains": [
- "*.d7zdnegbre53n.amplifyapp.com",
- "d7zdnegbre53n.amplifyapp.com"
- ],
- "extensions": {
- "authorityInfoAccess" : "CA Issuers - URI:http://crt.r2m02.amazontrust.com/r2m02.cer\nOCSP - URI:http://ocsp.r2m02.amazontrust.com\n",
- "authorityKeyIdentifier" : "keyid:C0:31:52:CD:5A:50:C3:82:7C:74:71:CE:CB:E9:9C:F9:7A:EB:82:E2\n",
- "basicConstraints" : "CA:FALSE",
- "certificatePolicies" : "Policy: 2.23.140.1.2.1",
- "crlDistributionPoints" : "Full Name:\n URI:http://crl.r2m02.amazontrust.com/r2m02.crl",
- "ctlPoisonByte" : true,
- "extendedKeyUsage" : "TLS Web server authentication, TLS Web client authentication",
- "keyUsage" : "Digital Signature, Key Encipherment",
- "subjectAltName" : "DNS:d7zdnegbre53n.amplifyapp.com, DNS:*.d7zdnegbre53n.amplifyapp.com",
- "subjectKeyIdentifier" : "59:32:78:2A:11:03:62:55:BB:3B:B9:80:24:76:28:90:2E:D1:A4:56"
- },
- "fingerprint": "D9:05:A3:D5:AA:F9:68:BC:0C:0A:15:69:C9:5E:11:92:32:67:4F:FA",
- "issuer": {
- "C" : "US",
- "CN" : "Amazon RSA 2048 M02",
- "L" : null,
- "O" : "Amazon",
- "OU" : null,
- "ST" : null,
- "aggregated" : "/C=US/CN=Amazon RSA 2048 M02/O=Amazon",
- "emailAddress" : null
- },
- "not_after" : 1743811199,
- "not_before" : 1709596800,
- "serial_number" : "FDB450C1942E3D30A18737063449E62",
- "signature_algorithm" : "sha256, rsa",
- "subject": {
- "C" : null,
- "CN" : "*.d7zdnegbre53n.amplifyapp.com",
- "L" : null,
- "O" : null,
- "OU" : null,
- "ST" : null,
- "aggregated" : "/CN=*.d7zdnegbre53n.amplifyapp.com",
- "emailAddress" : null
- }
- },
- "seen": 1709651773.594684,
- "source": {
- "name" : "DigiCert Yeti2025 Log",
- "url" : "https://yeti2025.ct.digicert.com/log/"
- },
- "update_type": "PrecertLogEntry"
- },
- "message_type": "certificate_update"
- }
-'''
diff --git a/ingestors/ingest_certstream.py b/ingestors/ingest_certstream.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
+# ingest_certstream.py
+
+import asyncio
+import json
+import logging
+import time
+
+try:
+ import websockets
+except ImportError:
+ raise ImportError('Missing required \'websockets\' library. (pip install websockets)')
+
+
+# Set a default elasticsearch index if one is not provided
+default_index = 'cert-stream'
+
+
+def construct_map() -> dict:
+ '''Construct the Elasticsearch index mapping for Certstream records.'''
+
+ # Match on exact value or full text search
+ keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
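+ # e.g. 'domain' supports full-text queries, while 'domain.keyword' supports exact matches and aggregations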
+
+ # Construct the index mapping
+ mapping = {
+ 'mappings': {
+ 'properties' : {
+ 'domain' : keyword_mapping,
+ 'seen' : { 'type': 'date' }
+ }
+ }
+ }
+
+ return mapping
+
+
+async def process_data(place_holder: str = None):
+ '''
+ Read and process Certstream records live from the Websocket stream.
+
+ :param place_holder: Placeholder parameter to match the process_data function signature of other ingestors.
+ '''
+
+ while True:
+ try:
+ async with websockets.connect('wss://certstream.calidog.io') as websocket:
+ while True:
+ # Read a line from the websocket
+ line = await websocket.recv()
+
+ # Parse the JSON record
+ try:
+ record = json.loads(line)
+ except json.decoder.JSONDecodeError:
+ logging.error(f'Invalid line from the websocket: {line}')
+ continue
+
+ # Grab the unique domains from the record (excluding wildcards)
+ domains = record['data']['leaf_cert']['all_domains']
+ domains = set([domain[2:] if domain.startswith('*.') else domain for domain in domains])
+
+ # Construct the document
+ for domain in domains:
+ struct = {
+ 'domain' : domain,
+ 'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+ }
+
+ yield {'_index': default_index, '_source': struct}
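+ # With no '_id' supplied, Elasticsearch auto-generates one, so repeat sightings of a domain become separate documents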
+
+ except websockets.ConnectionClosed:
+ logging.error('Connection to Certstream was closed. Attempting to reconnect...')
+ await asyncio.sleep(15)
+
+ except Exception as e:
+ logging.error(f'An error occurred while processing Certstream records! ({e})')
+ break
+
+
+async def test():
+ '''Test the ingestion process.'''
+
+ async for document in process_data():
+ print(document)
+
+
+
+if __name__ == '__main__':
+ import asyncio
+
+ asyncio.run(test())
+
+
+
+'''
+Output:
+ {
+ "data": {
+ "cert_index": 43061646,
+ "cert_link": "https://yeti2025.ct.digicert.com/log/ct/v1/get-entries?start=43061646&end=43061646",
+ "leaf_cert": {
+ "all_domains": [
+ "*.d7zdnegbre53n.amplifyapp.com",
+ "d7zdnegbre53n.amplifyapp.com"
+ ],
+ "extensions": {
+ "authorityInfoAccess" : "CA Issuers - URI:http://crt.r2m02.amazontrust.com/r2m02.cer\nOCSP - URI:http://ocsp.r2m02.amazontrust.com\n",
+ "authorityKeyIdentifier" : "keyid:C0:31:52:CD:5A:50:C3:82:7C:74:71:CE:CB:E9:9C:F9:7A:EB:82:E2\n",
+ "basicConstraints" : "CA:FALSE",
+ "certificatePolicies" : "Policy: 2.23.140.1.2.1",
+ "crlDistributionPoints" : "Full Name:\n URI:http://crl.r2m02.amazontrust.com/r2m02.crl",
+ "ctlPoisonByte" : true,
+ "extendedKeyUsage" : "TLS Web server authentication, TLS Web client authentication",
+ "keyUsage" : "Digital Signature, Key Encipherment",
+ "subjectAltName" : "DNS:d7zdnegbre53n.amplifyapp.com, DNS:*.d7zdnegbre53n.amplifyapp.com",
+ "subjectKeyIdentifier" : "59:32:78:2A:11:03:62:55:BB:3B:B9:80:24:76:28:90:2E:D1:A4:56"
+ },
+ "fingerprint": "D9:05:A3:D5:AA:F9:68:BC:0C:0A:15:69:C9:5E:11:92:32:67:4F:FA",
+ "issuer": {
+ "C" : "US",
+ "CN" : "Amazon RSA 2048 M02",
+ "L" : null,
+ "O" : "Amazon",
+ "OU" : null,
+ "ST" : null,
+ "aggregated" : "/C=US/CN=Amazon RSA 2048 M02/O=Amazon",
+ "emailAddress" : null
+ },
+ "not_after" : 1743811199,
+ "not_before" : 1709596800,
+ "serial_number" : "FDB450C1942E3D30A18737063449E62",
+ "signature_algorithm" : "sha256, rsa",
+ "subject": {
+ "C" : null,
+ "CN" : "*.d7zdnegbre53n.amplifyapp.com",
+ "L" : null,
+ "O" : null,
+ "OU" : null,
+ "ST" : null,
+ "aggregated" : "/CN=*.d7zdnegbre53n.amplifyapp.com",
+ "emailAddress" : null
+ }
+ },
+ "seen": 1709651773.594684,
+ "source": {
+ "name" : "DigiCert Yeti2025 Log",
+ "url" : "https://yeti2025.ct.digicert.com/log/"
+ },
+ "update_type": "PrecertLogEntry"
+ },
+ "message_type": "certificate_update"
+ }
+'''
diff --git a/sniff_patch.py b/sniff_patch.py
@@ -1,100 +0,0 @@
-#!/usr/bin/env python
-# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
-# sniff_patch.py [asyncronous developement]
-
-# Note:
-# This is a patch for the elasticsearch 8.x client to fix the sniff_* options.
-# This patch is only needed if you use the sniff_* options and only works with basic auth.
-# Call init_elasticsearch() with normal Elasticsearch params.
-#
-# Source:
-# - https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960
-
-import base64
-
-import elasticsearch._async.client as async_client
-from elasticsearch.exceptions import SerializationError, ConnectionError
-
-
-async def init_elasticsearch(*args, **kwargs):
- '''
- Initialize the Async Elasticsearch client with the sniff patch.
-
- :param args: Async Elasticsearch positional arguments.
- :param kwargs: Async Elasticsearch keyword arguments.
- '''
- async_client.default_sniff_callback = _override_sniff_callback(kwargs['basic_auth'])
-
- return async_client.AsyncElasticsearch(*args, **kwargs)
-
-
-def _override_sniff_callback(basic_auth):
- '''
- Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166
- Completely unmodified except for adding the auth header to the elastic request.
- Allows us to continue using the sniff_* options while this is broken in the library.
-
- TODO: Remove this when this issue is patched:
- - https://github.com/elastic/elasticsearch-py/issues/2005
- '''
- auth_str = base64.b64encode(':'.join(basic_auth).encode()).decode()
- sniffed_node_callback = async_client._base._default_sniffed_node_callback
-
- async def modified_sniff_callback(transport, sniff_options):
- for _ in transport.node_pool.all():
- try:
- meta, node_infos = await transport.perform_request(
- 'GET',
- '/_nodes/_all/http',
- headers={
- 'accept': 'application/vnd.elasticsearch+json; compatible-with=8',
- 'authorization': f'Basic {auth_str}' # This auth header is missing in 8.x releases of the client, and causes 401s
- },
- request_timeout=(
- sniff_options.sniff_timeout
- if not sniff_options.is_initial_sniff
- else None
- ),
- )
- except (SerializationError, ConnectionError):
- continue
-
- if not 200 <= meta.status <= 299:
- continue
-
- node_configs = []
- for node_info in node_infos.get('nodes', {}).values():
- address = node_info.get('http', {}).get('publish_address')
- if not address or ':' not in address:
- continue
-
- if '/' in address:
- # Support 7.x host/ip:port behavior where http.publish_host has been set.
- fqdn, ipaddress = address.split('/', 1)
- host = fqdn
- _, port_str = ipaddress.rsplit(':', 1)
- port = int(port_str)
- else:
- host, port_str = address.rsplit(':', 1)
- port = int(port_str)
-
- assert sniffed_node_callback is not None
-
- # Pay not mind to this, it's just a workaround for my own setup.
- #host = elastic.domain.com
- #port = int(str(port).replace('', ''))
-
- sniffed_node = sniffed_node_callback(node_info, meta.node.replace(host=host, port=port))
-
- if sniffed_node is None:
- continue
-
- # Use the node which was able to make the request as a base.
- node_configs.append(sniffed_node)
-
- if node_configs:
- return node_configs
-
- return []
-
- return modified_sniff_callback