proxytools

- collection of scripts for harvesting & testing proxies
git clone git://git.acid.vegas/proxytools.git

sockhub.py (2787B)

#!/usr/bin/env python
# SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)

'''
There is a file in this repository called proxy_sources.txt which contains a list of URLs to scrape for proxies.
This list is not maintained and may contain dead links or links to sites that no longer contain proxies.
'''

import concurrent.futures
import logging
import os
import re
import urllib.request

# Global
proxies = list()

def find_proxies(url: str) -> None:
	'''
	Check a URL for IP:PORT proxies.

	:param url: The URL to check for proxies.
	'''
	global proxies # results are appended to the shared module-level list

	try:
		source = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'SockHub/1.0'})).read().decode()
		if source:
			# Pull anything that looks like an IP:PORT pair out of the page source
			found = set(re.findall(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+', source, re.MULTILINE))
			if (new_proxies := [proxy for proxy in found if proxy not in proxies]):
				proxies += new_proxies
				logging.info(f'found \033[32m{len(new_proxies):,}\033[0m new proxies on \033[34m{url}\033[0m')
		else:
			logging.warning(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m(source is empty)\033[0m')
	except Exception as ex:
		logging.error(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m({ex})\033[0m')


if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description='SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)')
	parser.add_argument('-i', '--input',  help='input file containing a list of URLs to scrape (one per line) or a single URL', required=True)
	parser.add_argument('-o', '--output', help='output file to save proxies to', default='proxies.txt')
	parser.add_argument('-c', '--concurrency', help='number of concurrent threads to use (default: 10)', default=10, type=int)
	args = parser.parse_args()

	logging.basicConfig(format='%(message)s', level=logging.INFO)

	# Build the list of source URLs from either a file or a single URL
	if os.path.isfile(args.input):
		proxy_sources = [line.strip() for line in open(args.input, 'r').read().split('\n') if line.strip()]
	elif args.input.startswith('https://') or args.input.startswith('http://'):
		logging.info('using input as a single url...')
		proxy_sources = [args.input]
	else:
		raise SystemExit('input file does not exist!')

	if not proxy_sources:
		raise SystemExit('proxy sources input file is empty!')

	logging.info(f'scanning \033[35m{len(proxy_sources):,}\033[0m urls from list...')

	with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
		futures = [executor.submit(find_proxies, url) for url in proxy_sources]
		concurrent.futures.wait(futures)

	if proxies:
		logging.info(f'found \033[32m{len(proxies):,}\033[0m total proxies!')
		proxies.sort()
		with open(args.output, 'w') as output_file:
			for proxy in proxies:
				output_file.write(proxy + '\n')
	else:
		logging.warning('no proxies found!')
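
Example invocations, based on the argparse options above. The thread count and the single-URL source are illustrative; proxy_sources.txt is the URL list mentioned in the docstring, one URL per line:

    # scrape every URL listed in proxy_sources.txt and write the results to proxies.txt
    python sockhub.py -i proxy_sources.txt -o proxies.txt -c 25

    # scrape a single source URL, keeping the default output file and concurrency
    python sockhub.py -i https://example.com/proxy-list.txt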
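
To see the extraction step inside find_proxies on its own, here is a minimal sketch; the sample text is made up and uses documentation IP addresses:

    import re

    # any IP:PORT pair embedded in arbitrary page text is picked up by the pattern
    sample = 'uptime 99% 203.0.113.7:8080 elite\nfree 198.51.100.23:3128 transparent'
    found = set(re.findall(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+', sample, re.MULTILINE))
    print(found)  # e.g. {'203.0.113.7:8080', '198.51.100.23:3128'}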