proxytools - collection of scripts for harvesting & testing proxies
git clone git://git.acid.vegas/proxytools.git
sockhub.py (2805B)
#!/usr/bin/env python
# SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)

'''
There is a file in this repository called proxy_sources.txt which contains a list of URLs to scrape for proxies.
This list is not maintained and may contain dead links or links to sites that no longer contain proxies.
'''

import concurrent.futures
import logging
import os
import re
import urllib.request

# Global list of found proxies, shared by all worker threads
proxies = list()

def find_proxies(url: str) -> None:
    '''
    Check a URL for IP:PORT proxies.

    :param url: The URL to check for proxies.
    '''

    global proxies

    try:
        source = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'SockHub/1.0'})).read().decode()
        if source:
            # Collect every unique IP:PORT match in the page body
            found = set(re.findall(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+', source, re.MULTILINE))
            if (new_proxies := [proxy for proxy in found if proxy not in proxies]):
                proxies += new_proxies
                logging.info(f'found \033[32m{len(new_proxies):,}\033[0m new proxies on \033[34m{url}\033[0m')
        else:
            logging.warning(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m(source is empty)\033[0m')
    except Exception as ex:
        logging.error(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m({ex})\033[0m')


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)')
    parser.add_argument('-i', '--input', required=True, help='input file containing a list of URLs to scrape (one per line) or a single URL')
    parser.add_argument('-o', '--output', help='output file to save proxies to', default='proxies.txt')
    parser.add_argument('-c', '--concurrency', help='number of concurrent threads to use (default: 10)', default=10, type=int)
    args = parser.parse_args()

    logging.basicConfig(format='%(message)s', level=logging.INFO)

    # Input is either a file of source URLs (one per line) or a single URL
    if os.path.isfile(args.input):
        proxy_sources = [line.strip() for line in open(args.input, 'r').read().split('\n') if line.strip()]
    elif args.input.startswith('https://') or args.input.startswith('http://'):
        logging.info('using input as a single url...')
        proxy_sources = [args.input]
    else:
        raise SystemExit('input file does not exist!')

    if not proxy_sources:
        raise SystemExit('proxy sources input file is empty!')

    logging.info(f'scanning \033[35m{len(proxy_sources):,}\033[0m urls from list...')

    # Scrape all sources concurrently and wait for every worker to finish
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        futures = [executor.submit(find_proxies, url) for url in proxy_sources]
        concurrent.futures.wait(futures)

    if proxies:
        logging.info(f'found \033[32m{len(proxies):,}\033[0m total proxies!')
        proxies.sort()
        with open(args.output, 'w') as output_file:
            for proxy in proxies:
                output_file.write(proxy + '\n')
    else:
        logging.warning('no proxies found!')
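For reference, a typical invocation might look like the line below, assuming the script is run from the repository root where proxy_sources.txt lives; the concurrency value of 25 is only an example, the default is 10 and the output defaults to proxies.txt:

python sockhub.py -i proxy_sources.txt -o proxies.txt -c 25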
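Note that the IP:PORT regular expression above is intentionally loose and will also match strings such as 999.999.999.999:99999. A minimal sketch of an optional post-filter using the standard-library ipaddress module could look like the following; the is_valid_proxy helper is hypothetical and not part of the original script:

import ipaddress

def is_valid_proxy(proxy: str) -> bool:
    # Hypothetical helper, not in the original script: keep only matches whose
    # address parses as IPv4 and whose port fits in the valid TCP port range.
    host, _, port = proxy.rpartition(':')
    try:
        ipaddress.IPv4Address(host)
        return 0 < int(port) < 65536
    except ValueError:
        return False

# Example: filter the scraped results before writing them out
# proxies = [proxy for proxy in proxies if is_valid_proxy(proxy)]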