random - collection of un-sorted bollocks |
git clone git://git.acid.vegas/random.git |
Log | Files | Refs | Archive |
craggle.py (2836B)
#!/usr/bin/env python
# CraigsList Parser - Developed by acidvegas in Python (https://acid.vegas/random)

'''
Random script to parse all the countries, states, cities, & sections/sub-sections on CraigsList
Dont know what I am doing with this yet...
'''

import re
import sys
import time
import urllib.request

# Index page listing every country / state / city site.
SITES_URL = 'http://www.craigslist.org/about/sites?lang=en&cc=us'

# Flags shared by every pattern in this script.
FLAGS = re.IGNORECASE | re.MULTILINE

# Closing-tag run that ends a country section on the index page. Whitespace
# between the tags is matched loosely so the marker does not depend on the
# exact indentation of the page.
COUNTRY_END = r'</a></li>\s*</ul>\s*</div>\s*</div>'


def between(source, start, stop, pos=0):
    '''
    Return the shortest text found between the patterns `start` and `stop`.

    `start` and `stop` are regular-expression fragments (not literals); pass
    them through re.escape() first when they come from scraped text.
    `pos` is the offset in `source` where the search begins (default 0).
    Returns False when no match is found.
    '''
    found = re.compile(start + '(.*?)' + stop, FLAGS).search(source, pos)
    return found.group(1) if found else False


def get_source(url):
    '''
    Download `url` (10 second timeout) and return the body as text.

    The body is decoded with the charset announced in the response headers,
    falling back to UTF-8 when none is given. Errors raised by urlopen() or
    by decoding are propagated to the caller.
    '''
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(url, timeout=10) as response:
        charset = response.headers.get_content_charset()
        body = response.read()
    return body.decode(charset) if charset else body.decode()


def parse_sites(source):
    '''
    Parse the site index page into a nested list.

    Returns [(country_anchor, country_name, states), ...] where `states` is
    [(state_name, cities), ...] and `cities` is [(city_name, city_url), ...].
    City URLs have everything from '/?' onwards removed. A country or state
    whose section cannot be located gets an empty list and a warning on
    stderr.
    '''
    # Country headings are collected before line breaks are removed, exactly
    # like the heading pattern expects them.
    countries = re.findall('<h1><a name="(.*?)"></a>(.*?)</h1>', source, FLAGS)
    flat = source.replace('\n', '').replace('\r', '')
    sites = []
    for anchor, name in countries:
        states = []
        sites.append((anchor, name, states))
        # Scraped text is escaped so characters such as '+' or '(' in a name
        # are matched literally instead of being read as regex syntax.
        header = '<h1><a name="{0}"></a>{1}</h1>'.format(re.escape(anchor), re.escape(name))
        section = re.compile(header + '(.*?)' + COUNTRY_END, FLAGS).search(flat)
        if section is None:
            print(f'warning: no section found for country {name!r}', file=sys.stderr)
            continue
        for state in re.findall('<h4>(.*?)</h4>', section.group(1), FLAGS):
            # Search from this country's heading onwards: a state heading that
            # also appears under an earlier country must not be picked up.
            # The whole page (not just the section text) is searched because
            # the section text stops short of the final closing </ul>.
            listing = between(flat, f'<h4>{re.escape(state)}</h4>', '</ul>', section.start())
            if listing is False:
                print(f'warning: no city list found for state {state!r}', file=sys.stderr)
                states.append((state, []))
                continue
            links = re.findall('<li><a href="(.*?)">(.*?)</a></li>', listing, FLAGS)
            states.append((state, [(label, href.split('/?')[0]) for href, label in links]))
    return sites


def find_categories(page):
    '''
    Return [(category_title, category_code), ...] found on a city front page.

    `page` is the page text with line breaks already removed.
    '''
    return re.findall('data-alltitle="all (.*?)" data-cat="(.*?)">', page, FLAGS)


def find_sub_categories(page, title):
    '''
    Return the sub-category names listed under the category `title`.

    `page` is the page text with line breaks already removed. Returns None
    when the category's section cannot be located on the page.
    '''
    slug = re.escape(title.replace(' ', '-'))
    section = between(page, f'<h4 class="ban"><a href="/d/{slug}/search', '</ul></div></div>')
    if section is False:
        return None
    return re.findall('span class="txt">(.*?)<sup class', section, FLAGS)


def main():
    '''
    Crawl every city site and print category / sub-category statistics.

    Prints one line per sub-category seen, then the country / state / city
    totals and how many city pages listed each category and sub-category.
    '''
    db = {'category': dict(), 'subcat': dict()}
    main_data = dict()
    statess = 0
    citiess = 0
    for anchor, country_name, states in parse_sites(get_source(SITES_URL)):
        country_entry = main_data[anchor.lower()] = dict()
        statess += len(states)
        for state, cities in states:
            state_entry = country_entry[state.lower()] = dict()
            citiess += len(cities)
            for city_name, city_url in cities:
                state_entry[city_name] = city_url
                try:
                    page = get_source(city_url)
                except (OSError, UnicodeError, LookupError) as error:
                    # One unreachable or undecodable city must not abort the
                    # whole crawl (URLError and timeouts are OSError).
                    print(f'warning: skipping {city_url}: {error}', file=sys.stderr)
                    continue
                page = page.replace('\n', '').replace('\r', '')
                for category in find_categories(page):
                    title = category[0]
                    db['category'][title] = db['category'].get(title, 0) + 1
                    if title == 'resumes':
                        # 'resumes' is excluded from the sub-category scan.
                        continue
                    sub_categories = find_sub_categories(page, title)
                    if sub_categories is None:
                        print(f'warning: no sub-category section for {category!r} at {city_url}', file=sys.stderr)
                        continue
                    for sub_category in sub_categories:
                        print(f'{country_name} | {state} | {city_name} | {title} | {sub_category}')
                        db['subcat'][sub_category] = db['subcat'].get(sub_category, 0) + 1
    print(f'Country : {len(main_data)}')
    print(f'State : {statess}')
    print(f'City : {citiess}')
    print(str(db['category']))
    print('\n\n\n')
    print(str(db['subcat']))


if __name__ == '__main__':
    main()