#!/usr/bin/python
#
# Author:	Fabian Affolter <fab at fedoraproject dot org>
#
# Name:		amb_count.py
#
# Release:	0.1
# Licence:	GPL v.2
#
# Changelog
# 06.04.2008	first release

import sys, urllib, os, re
from BeautifulSoup import BeautifulSoup

def countryList(countries):
    """
    Collect the <a> tags of the Fedora Project ambassador category pages.

    For every entry in `countries` the page at
    'http://fedoraproject.org/wiki/CategoryAmbassadors' + country
    is downloaded and parsed, and each <a> element found in it is
    stored in its string form.

    countries -- iterable of country-name strings that are appended to
                 the base URL as-is (no separator, no escaping).

    Returns one flat list holding the string form of every <a> tag of
    all requested pages; an empty `countries` yields an empty list and
    performs no network access.
    """
    base = 'http://fedoraproject.org/wiki/CategoryAmbassadors'
    urlList = []
    for country in countries:
        page = urllib.urlopen(base + country)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Always release the HTTP handle, even if parsing raises.
            page.close()
        urlList.extend(str(tag) for tag in soup("a"))
    return urlList
		
def cleanList(rawHTML):
    """
    Keep only the entries containing '%28CategoryCategory%5Cb%29'.

    rawHTML -- iterable of strings to filter.

    Returns a new list with the matching entries in their input order.
    """
    marker = "%28CategoryCategory%5Cb%29"
    return [entry for entry in rawHTML if marker in entry]
	
def get_names(cleanHTML):
    """
    Extract the ambassador name from each line.

    The pattern lazily skips over two letter-only words (two or more
    letters each, matched case-insensitively) and captures the next
    such word, which is returned as the name.

    cleanHTML -- iterable of strings to search.

    Returns a list with the captured word of every matching line, in
    input order. Lines the pattern does not match are skipped rather
    than failing with AttributeError on the missing match object.
    """
    filler = r'.*?'            # Non-greedy match on filler
    word = r'(?:[a-z][a-z]+)'  # A word of two or more letters
    # Compiled once, outside the loop; only the third word is captured.
    name_re = re.compile(
        filler + word + filler + word + filler + '(' + word + ')',
        re.IGNORECASE | re.DOTALL)
    result = []
    for line in cleanHTML:
        match = name_re.search(line)
        if match is not None:
            result.append(match.group(1))
    return result

def doubleNames(all_names):
    """
    Remove double entries and return the remaining names sorted.

    all_names -- iterable of hashable, mutually comparable names.

    Returns a new sorted list in which every name appears exactly once.
    """
    # sorted() always returns a list, so this does not depend on
    # dict.keys() being a sortable list and no builtin gets shadowed.
    return sorted(set(all_names))
	
def lengthList(names):
    """
    Return the number of entries in `names` (the ambassador count).
    """
    return len(names)

def main(countries=None):
    """
    Print the number of distinct ambassador names for some countries.

    countries -- optional iterable of country names as used in the wiki
                 category URLs; when omitted, a small EMEA test set is
                 used.

    Returns None, so `sys.exit(main())` ends with exit status 0.
    """
    if countries is None:
        # Countries in EMEA, only a few for testing
        # (56 ambassadors in these 5 countries)
        countries = ['France', 'Italy', 'Switzerland', 'Germany', 'Austria']
    countryHTML = countryList(countries)
    cleanHTML = cleanList(countryHTML)
    all_names = get_names(cleanHTML)
    names = doubleNames(all_names)
    # If you want to know the names of the ambassadors in the region you selected
    #print(names)
    # Parenthesised single-argument form prints the same under the
    # Python 2 print statement and the Python 3 print function.
    print(lengthList(names))

if __name__ == "__main__":
    # Run only when executed as a script, not when imported.
    exit_status = main()
    sys.exit(exit_status)