random

- collection of un-sorted bollocks
git clone git://git.acid.vegas/random.git
Log | Files | Refs | Archive

craggle.py (2836B)

      1 #!/usr/bin/env python
      2 # CraigsList Parser - Developed by acidvegas in Python (https://acid.vegas/random)
      3 
      4 '''
      5 Random script to parse all the countries, states, cities, & sections/sub-sections on CraigsList
      6 Don't know what I am doing with this yet...
      7 '''
      8 
      9 import re, time, urllib.request
     10 
     11 def between(source, start, stop):
     12 	data = re.compile(start + '(.*?)' + stop, re.IGNORECASE|re.MULTILINE).search(source)
     13 	return data.group(1) if data else False
     14 
     15 def get_source(url):
     16 	source = urllib.request.urlopen(url, timeout=10)
     17 	charset = source.headers.get_content_charset()
     18 	return source.read().decode(charset) if charset else source.read().decode()
     19 
     20 db        = {'category':dict(),'subcat':dict()}
     21 source    = get_source('http://www.craigslist.org/about/sites?lang=en&cc=us')
     22 countries = re.findall('<h1><a name="(.*?)"></a>(.*?)</h1>', source, re.IGNORECASE|re.MULTILINE)
     23 source    = source.replace('\n', '').replace('\r','')
     24 main_data = dict()
     25 statess = 0
     26 citiess = 0
     27 for country in countries:
     28 	main_data[country[0].lower()] = dict()
     29 	data   = between(source, '<h1><a name="{0}"></a>{1}</h1>'.format(country[0], country[1]),'</a></li>                    </ul>                </div>            </div>')
     30 	states = re.findall('<h4>(.*?)</h4>', data, re.IGNORECASE|re.MULTILINE)
     31 	statess += len(states)
     32 	for state in states:
     33 		main_data[country[0].lower()][state.lower()] = dict()
     34 		state_data = between(source, f'<h4>{state}</h4>', '</ul>')
     35 		cities = re.findall('<li><a href="(.*?)">(.*?)</a></li>', state_data, re.IGNORECASE|re.MULTILINE)
     36 		citiess += len(cities)
     37 		for city in cities:
     38 			main_data[country[0].lower()][state.lower()][city[1]] = city[0].split('/?')[0]
     39 			new_source = get_source(city[0].split('/?')[0])
     40 			new_source = new_source.replace('\n', '').replace('\r','')
     41 			categories = re.findall('data-alltitle="all (.*?)" data-cat="(.*?)">', new_source, re.IGNORECASE|re.MULTILINE)
     42 			for category in categories:
     43 				db['category'][category[0]] = db['category'][category[0]]+1 if category[0] in db['category'] else 1
     44 				if category[0] != 'resumes':
     45 					cat = category[0].replace(' ','-')
     46 					category_data  = between(new_source, f'<h4 class="ban"><a href="/d/{cat}/search', '</ul></div></div>')
     47 					try:
     48 						sub_categories = re.findall('span class="txt">(.*?)<sup class', category_data, re.IGNORECASE|re.MULTILINE)
     49 						for sub_category in sub_categories:
     50 							print(f'{country[1]} | {state} | {city[1]} | {category[0]} | {sub_category}')
     51 							db['subcat'][sub_category] = db['subcat'][sub_category]+1 if sub_category in db['subcat'] else 1
     52 					except:
     53 						print('\n\n\nerror !!!')
     54 						print(category_data)
     55 						print(category)
     56 						input('')
     57 print(f'Country : {len(main_data)}')
     58 print(f'State   : {statess}')
     59 print(f'City    : {citiess}')
     60 print(str(db['category']))
     61 print('\n\n\n')
     62 print(str(db['subcat']))