proxytools - collection of scripts for harvesting & testing proxies
git clone git://git.acid.vegas/proxytools.git
sockhub.py (2805B)
#!/usr/bin/env python
# SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)

'''
There is a file in this repository called proxy_sources.txt which contains a list of URLs to scrape for proxies.
This list is not maintained and may contain dead links or links to sites that no longer contain proxies.
'''

import concurrent.futures
import logging
import os
import re
import urllib.request

# Global list of found proxies, shared by all worker threads
proxies = list()

def find_proxies(url: str) -> None:
    '''
    Check a URL for IP:PORT proxies.

    :param url: The URL to check for proxies.
    '''

    global proxies

    try:
        source = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'SockHub/1.0'})).read().decode()
        if source:
            # Collect every unique IP:PORT match in the page body
            found = set(re.findall(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+', source, re.MULTILINE))
            if (new_proxies := [proxy for proxy in found if proxy not in proxies]):
                proxies += new_proxies
                logging.info(f'found \033[32m{len(new_proxies):,}\033[0m new proxies on \033[34m{url}\033[0m')
        else:
            logging.warning(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m(source is empty)\033[0m')
    except Exception as ex:
        logging.error(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m({ex})\033[0m')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)')
    parser.add_argument('-i', '--input', required=True, help='input file containing a list of URLs to scrape (one per line) or a single URL')
    parser.add_argument('-o', '--output', help='output file to save proxies to', default='proxies.txt')
    parser.add_argument('-c', '--concurrency', help='number of concurrent threads to use (default: 10)', default=10, type=int)
    args = parser.parse_args()

    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Input is either a file of source URLs (one per line) or a single URL
    if os.path.isfile(args.input):
        proxy_sources = [line.strip() for line in open(args.input, 'r').read().split('\n') if line.strip()]
    elif args.input.startswith('https://') or args.input.startswith('http://'):
        logging.info('using input as a single url...')
        proxy_sources = [args.input]
    else:
        raise SystemExit('input file does not exist!')

    if not proxy_sources:
        raise SystemExit('proxy sources input file is empty!')

    logging.info(f'scanning \033[35m{len(proxy_sources):,}\033[0m urls from list...')

    # Scrape all sources concurrently and wait for every worker to finish
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        futures = [executor.submit(find_proxies, url) for url in proxy_sources]
        concurrent.futures.wait(futures)

    if proxies:
        logging.info(f'found \033[32m{len(proxies):,}\033[0m total proxies!')
        proxies.sort()
        with open(args.output, 'w') as output_file:
            for proxy in proxies:
                output_file.write(proxy + '\n')
    else:
        logging.warning('no proxies found!')
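For reference, a typical invocation might look like the line below, assuming the script is run from the repository root where proxy_sources.txt lives; the concurrency value of 25 is only an example, the default is 10 and the output defaults to proxies.txt:

python sockhub.py -i proxy_sources.txt -o proxies.txt -c 25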
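Note that the IP:PORT regular expression above is intentionally loose and will also match strings such as 999.999.999.999:99999. A minimal sketch of an optional post-filter using the standard-library ipaddress module could look like the following; the is_valid_proxy helper is hypothetical and not part of the original script:

import ipaddress

def is_valid_proxy(proxy: str) -> bool:
    # Hypothetical helper, not in the original script: keep only matches whose
    # address parses as IPv4 and whose port fits in the valid TCP port range.
    host, _, port = proxy.rpartition(':')
    try:
        ipaddress.IPv4Address(host)
        return 0 < int(port) < 65536
    except ValueError:
        return False

# Example: filter the scraped results before writing them out
# proxies = [proxy for proxy in proxies if is_valid_proxy(proxy)]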