proxytools- collection of scripts for harvesting & testing proxies |
git clone git://git.acid.vegas/proxytools.git |
Log | Files | Refs | Archive | README | LICENSE |
sockhub.py (2787B)
#!/usr/bin/env python
# SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)

'''
There is a file in this repository called proxy_sources.txt which contains a list of URLs to scrape for proxies.
This list is not maintained and may contain dead links or links to sites that no longer contain proxies.
'''

import concurrent.futures
import logging
import os
import re
import urllib.request

# Pre-compiled IP:PORT matcher (raw string — the original non-raw '\.' escapes
# trigger DeprecationWarnings). re.MULTILINE was dropped: the pattern has no
# anchors, so the flag had no effect.
PROXY_REGEX = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+')

# Global accumulator of unique proxies gathered across all sources.
# NOTE(review): extended from multiple worker threads; list mutation is atomic
# under the GIL, but the membership check in find_proxies is still racy — the
# worst case is a duplicate entry, which the final sort/write tolerates.
proxies = list()

def find_proxies(url: str) -> None:
	'''
	Check a URL for IP:PORT proxies and append any new ones to the global list.

	Prints a summary on success; logs a warning/error (and swallows the
	exception) on an empty or unreachable source, so one dead URL cannot
	kill the thread pool.

	:param url: The URL to check for proxies.
	'''
	global proxies # required: 'proxies +=' without it makes the name local and raises UnboundLocalError
	try:
		request = urllib.request.Request(url, headers={'User-Agent': 'SockHub/1.0'})
		with urllib.request.urlopen(request, timeout=15) as response: # timeout so a dead host cannot hang a worker forever
			source = response.read().decode()
		if source:
			found = set(PROXY_REGEX.findall(source))
			if (new_proxies := [proxy for proxy in found if proxy not in proxies]):
				proxies += new_proxies
				# report len(new_proxies), not len(found): the message says "new proxies"
				print(f'found \033[32m{len(new_proxies):,}\033[0m new proxies on \033[34m{url}\033[0m')
		else:
			logging.warning(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m(source is empty)\033[0m')
	except Exception as ex:
		logging.error(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m({ex})\033[0m')


if __name__ == '__main__':
	import argparse
	parser = argparse.ArgumentParser(description='SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)')
	# required=True: the original allowed -i to be omitted, then crashed with
	# TypeError on os.path.isfile(None)
	parser.add_argument('-i', '--input', required=True, help='input file containing a list of URLs to scrape (one per line) or a single URL')
	parser.add_argument('-o', '--output', help='output file to save proxies to', default='proxies.txt')
	parser.add_argument('-c', '--concurrency', help='number of concurrent threads to use (default: 10)', default=10, type=int)
	args = parser.parse_args()

	logging.basicConfig(format='%(message)s', level=logging.INFO)

	if os.path.isfile(args.input):
		with open(args.input, 'r') as input_file: # context manager: the original leaked the handle
			proxy_sources = [line.strip() for line in input_file if line.strip()] # skip blank lines
	elif args.input.startswith(('https://', 'http://')):
		logging.info('using input as a single url...')
		proxy_sources = [args.input]
	else:
		# logging.fatal() only logs — it does not exit, so the original fell
		# through and crashed; raise SystemExit to actually stop.
		raise SystemExit('input file does not exist!')

	if not proxy_sources:
		raise SystemExit('proxy sources input file is empty!')

	# f-prefix was missing in the original, and it referenced an undefined 'urls'
	logging.debug(f'scanning \033[35m{len(proxy_sources):,}\033[0m urls from list...')

	with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
		futures = [executor.submit(find_proxies, url) for url in proxy_sources]
		concurrent.futures.wait(futures)

	if proxies:
		logging.info(f'found \033[32m{len(proxies):,}\033[0m total proxies!') # f-prefix was missing in the original
		proxies.sort()
		with open(args.output, 'w') as output_file:
			for proxy in proxies:
				output_file.write(proxy + '\n')
	else:
		logging.warning('no proxies found!')