#!/usr/bin/python
#
# Author: Fabian Affolter
#
# Name: amb_count.py
#
# Release: 0.1
# Licence: GPL v.2
#
# Changelog
# 06.04.2008 first release

import os
import re
import sys
import urllib

try:
    # Python 3 location of urlopen.
    from urllib.request import urlopen
except ImportError:
    # Python 2 location of urlopen.
    from urllib import urlopen

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    # Only countryList() needs the HTML parser. Deferring the failure keeps
    # the pure helper functions importable when the parser is not installed.
    BeautifulSoup = None


# Substring that cleanList() requires a line to contain.
_CATEGORY_MARKER = "%28CategoryCategory%5Cb%29"

# Skips two runs of two-or-more letters and captures the third one.
_FILLER = r'.*?'                     # Non-greedy match on filler
_SKIPPED_WORD = r'(?:[a-z][a-z]+)'   # Uninteresting
_CAPTURED_WORD = r'((?:[a-z][a-z]+))'
_NAME_RE = re.compile(
    _FILLER + _SKIPPED_WORD + _FILLER + _SKIPPED_WORD + _FILLER + _CAPTURED_WORD,
    re.IGNORECASE | re.DOTALL,
)


def countryList(countries):
    """
    Fetch the ambassador category page of every country and collect its links.

    For each entry of *countries* the page
    'http://fedoraproject.org/wiki/CategoryAmbassadors' + country is
    downloaded and every <a> tag found in it is appended, as a string, to the
    result.

    Returns a list with the string form of all anchor tags of all pages, in
    page order.

    Raises ImportError when the BeautifulSoup module is not installed.
    """
    if BeautifulSoup is None:
        raise ImportError("No module named BeautifulSoup")

    urlList = []
    for country in countries:
        response = urlopen(
            'http://fedoraproject.org/wiki/CategoryAmbassadors' + country)
        try:
            soup = BeautifulSoup(response)
        finally:
            # Release the connection even if parsing fails.
            response.close()
        urlList.extend(str(tag) for tag in soup("a"))
    return urlList


def cleanList(rawHTML):
    """
    Keep only the lines that contain the category marker.

    Returns a new list with the entries of *rawHTML*, in their original
    order, that contain the substring '%28CategoryCategory%5Cb%29'.
    """
    return [line for line in rawHTML if _CATEGORY_MARKER in line]


def get_names(cleanHTML):
    """
    Extracting the names of the ambassadors.

    From every line the third run of two or more letters is taken as the
    name (presumably the wiki page name inside the link; verify against
    the live page markup).  Lines that contain fewer than three such runs
    are skipped instead of aborting the whole run.

    Returns the list of extracted names, duplicates included.
    """
    result = []
    for line in cleanHTML:
        match = _NAME_RE.search(line)
        if match is None:
            # Nothing to extract from this line.
            continue
        result.append(match.group(1))
    return result


def doubleNames(all_names):
    """
    Remove double entries...Gerold per example ;-)

    Returns a new, sorted list in which every name of *all_names* appears
    exactly once.
    """
    return sorted(set(all_names))


def lengthList(names):
    """
    Get the number of ambassadors

    Returns the number of entries in *names*.
    """
    return len(names)


def main():
    """
    Print the number of distinct ambassador names found for a fixed list of
    countries.
    """
    # Countries in EMEA, only a few for testing (56 ambassadors in these 5 countries)
    countries = ['France', 'Italy', 'Switzerland', 'Germany', 'Austria']
    countryHTML = countryList(countries)
    cleanHTML = cleanList(countryHTML)
    all_names = get_names(cleanHTML)
    names = doubleNames(all_names)
    # If you want to know the names of the ambassadors in the region you selected
    #print(names)
    print(lengthList(names))


if __name__ == "__main__":
    sys.exit(main())