Map/Reduce in Python with Quick Function Examples

We have map and reduce functions in Python. They both take two inputs, a function and a list/tuple.

Map applies the function to every element of the list/tuple and returns an iterator over the results.

Reduce applies the function to the first two elements, then applies it again to that result and the 3rd element, and so on until a single value remains.

def f(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

from functools import reduce
def fn(x, y):
    """Return ``x * 10 + y``.

    Used as a reduce step: ``x`` is the running value and ``y`` is the next
    digit to append on its right.
    """
    shifted = x * 10
    return shifted + y

from functools import reduce
def fn(x, y):
    """Shift ``x`` one decimal place to the left and add ``y``."""
    tens = x * 10
    result = tens + y
    return result
def char2num(s):
    """Return the integer value of the single digit character ``s``.

    Only the keys '0' through '9' are known; any other key raises KeyError.
    """
    digit_values = dict(zip('0123456789', range(10)))
    return digit_values[s]
# Turn each character of '13579' into a digit, fold the digits into one
# integer with fn, and print the result.
print(
    reduce(
        fn,
        map(char2num, '13579'),
    )
)

To convert str into int

def str2int(s):
    """Convert a string of decimal digits to an int using map and reduce.

    Each character is looked up in a digit table (KeyError for anything that
    is not '0'-'9'), then the digits are folded left to right into a single
    integer. An empty string makes ``reduce`` raise TypeError.
    """
    digit_values = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
                    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}

    def to_digit(ch):
        return digit_values[ch]

    def append_digit(acc, digit):
        return acc * 10 + digit

    return reduce(append_digit, map(to_digit, s))

To capitalize the str inputs, compute the product of a list of numbers, and convert a str to a float.

def normalize(name):
    """Capitalize every string in ``name``, print the result and return it.

    Args:
        name: an iterable of strings (despite the singular parameter name).

    Returns:
        A list with ``str.capitalize`` applied to each item: first character
        upper-cased, the rest lower-cased.
    """
    capitalized = list(map(str.capitalize, name))
    # Printing is kept from the original; the list is also returned so the
    # result can be used by callers instead of only being displayed.
    print(capitalized)
    return capitalized


# Sample input for normalize().
L1 = ['adam', 'LISA', 'barT']

def prod(L):
    """Return the product of all items in ``L``.

    Args:
        L: an iterable of numbers.

    Returns:
        The items multiplied together from left to right. An empty iterable
        gives 1, the multiplicative identity, instead of the TypeError that
        ``reduce`` raises when it has no initial value.
    """
    # Starting from 1 leaves the product of non-empty numeric input
    # unchanged while giving empty input a defined result.
    return reduce(lambda acc, factor: acc * factor, L, 1)

def str2float(p):
    """Convert a decimal string such as ``'123.456'`` to a float.

    The digits on both sides of the decimal point are joined into one
    integer, which is then scaled down by ten to the number of fractional
    digits.

    Args:
        p: the text to convert; the decimal point is optional.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: if the text without its first '.' is not a valid integer
            (for example when ``p`` is empty or contains a second '.').
    """
    # partition splits once and tolerates a missing '.', in which case the
    # fractional part is empty and the divisor below is 1.
    whole, _, fraction = p.partition('.')
    return int(whole + fraction) / 10 ** len(fraction)
###The map function takes a key-value pair and returns a list of key/value pairs
###1. In this example, the input is a document ID and its text, and the output is a list of (word, count) pairs
from collections import defaultdict

def map_word_count(id, doc):
    """Map step: count how often each word occurs in one document.

    Args:
        id: identifier of the document; accepted for the key/value calling
            convention but not used in the counting.
        doc: the document text, split on whitespace into words.

    Returns:
        A list of ``(word, count)`` tuples, one per distinct word.
    """
    counts = defaultdict(int)
    for word in doc.split():
        counts[word] += 1
    return list(counts.items())

###shuffle will group all the results
def shuffle_words(results_generators):
    """Shuffle step: group the counts from every map result by word.

    Args:
        results_generators: an iterable of map results, each itself an
            iterable of ``(word, count)`` pairs.

    Yields:
        ``(word, counts)`` tuples, where ``counts`` is the list of every
        count recorded for that word across all map results.
    """
    records = defaultdict(list)
    for results in results_generators:
        for word, count in results:
            records[word].append(count)
    # Nothing is yielded until every map result has been consumed, so each
    # word's list is complete.
    for word in records:
        yield (word, records[word])

###the final step is reduce step, which takes a key-value pair and produces 
#a key-value pair as result
def reduce_counts(word, list_of_counts):
    """Reduce step: pair ``word`` with the sum of its collected counts."""
    total = sum(list_of_counts)
    return word, total

###Now we load news dataset for a quick example 
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 newsgroups corpus.
dataset = fetch_20newsgroups(subset='train')
# Use only the first 50 documents for this quick example; the raw texts are
# held in the dataset's `data` attribute.
documents = dataset.data[:50]

# Map: count the words of each document, using its position as the ID.
map_results = (
    map_word_count(doc_id, text) for doc_id, text in enumerate(documents)
)
# Shuffle: gather all counts for the same word.
shuffle_results = shuffle_words(map_results)
# Reduce: collapse every word's counts into one total.
reduce_results = [
    reduce_counts(key, values) for key, values in shuffle_results
]

#[('From:', 51), ('', 1), ("(where's", 1), ('my', 40), ('thing)', 1)]

###2. Take advantage of parallel computing on the local CPU
from joblib import Parallel, delayed
# Same map step as above, dispatched through joblib as two parallel jobs.
map_results = Parallel(n_jobs=2)(
    delayed(map_word_count)(doc_id, text)
    for doc_id, text in enumerate(documents)
)
# Group the per-document counts by word, exactly as in the serial version.
shuffle_results = shuffle_words(map_results)

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s