Find out the relations between different Indian spices
Spices are central to Indian cuisine. What is referred to colloquially as ‘Indian’ food is made of many different sub-cuisines. As a result, there is a plethora of spices usually brought up when considering ‘Indian’ food. Knowing which spices are most frequently used can help cooks, novice or seasoned, make an informed decision about the spices that promise the most bang for the buck.
I use a Kaggle dataset containing 6000+ recipes from https://www.archanaskitchen.com/. Using this data as a base collection of recipes representing most of Indian food, I analyze which spices occur most frequently and which spices are most connected to each other.
- Dataset for Indian recipes: this dataset contains 6000+ recipes scraped from https://www.archanaskitchen.com/ | Link to the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns
# IPython magic: render inline figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'
# IPython magic: pass a white ("w") facecolor to the inline backend's print_figure call.
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# Font sizes applied globally to every matplotlib figure drawn below.
plot_params = {
    "font.size": 22,        # default text size
    "axes.titlesize": 24,   # axes titles
    "axes.labelsize": 20,   # x / y axis labels
    "xtick.labelsize": 16,  # x tick labels
    "ytick.labelsize": 16,  # y tick labels
}
plt.rcParams.update(plot_params)
# Load the raw recipe table and take a first look at its layout.
food_df = pd.read_csv('./data/IndianFoodDatasetCSV.csv')
food_df.columns
food_df.shape

# Columns that are not needed for the spice analysis.
columns_to_drop = [
    'CookTimeInMins', 'Servings', 'Course', 'Diet',
    'Instructions', 'TranslatedInstructions', 'URL',
]
food_df = food_df.drop(columns=columns_to_drop)
# Discard every recipe that has a missing value in any remaining column.
food_df = food_df.dropna()

# Cuisine labels that are not Indian (or are meal types rather than cuisines).
cuisines_to_drop = [
    'Mexican', 'Italian Recipes', 'Thai', 'Chinese', 'Asian', 'Middle Eastern',
    'European', 'Arab', 'Japanese', 'Vietnamese', 'British', 'Greek', 'French',
    'Mediterranean', 'Sri Lankan', 'Indonesian', 'African', 'Korean', 'American',
    'Carribbean', 'World Breakfast', 'Malaysian', 'Dessert', 'Afghan', 'Snack',
    'Jewish', 'Brunch', 'Lunch', 'Continental', 'Fusion',
]
# Dropping entries in `food_df` which have non-indian cuisines
is_non_indian = food_df['Cuisine'].isin(cuisines_to_drop)
food_df = food_df.loc[~is_non_indian]
food_df.shape
food_df.head(5)
def filter_english(string):
    '''
    Tell whether a string consists of ASCII characters only.

    Used as a cheap "is this text in English?" filter: any character outside
    the ASCII range (for example Devanagari script or accented letters)
    makes the string fail the check.

    Parameters
    ----------
    string : str
        Text to test.

    Returns
    -------
    bool
        True if every character is ASCII (the empty string included),
        False otherwise.
    '''
    try:
        # Encoding straight to ASCII fails on the first non-ASCII character.
        # Unlike a UTF-8 encode followed by an ASCII decode, this also turns
        # lone surrogates into a plain False instead of an uncaught
        # UnicodeEncodeError.
        string.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True
# Keep only the recipes whose translated ingredient text passes filter_english.
is_english = food_df['TranslatedIngredients'].apply(filter_english)
df = food_df.loc[is_english]
df.shape
# reset_index() renumbers the rows from 0 and keeps the old labels in an 'index' column.
df = df.reset_index()
# Reference list of spices: first table of the Wikipedia page, lower-cased.
# NOTE: this downloads the page, so it needs network access.
wiki_file_pd = pd.read_html('https://en.wikipedia.org/wiki/List_of_Indian_spices')
spices_list = wiki_file_pd[0]['Standard English'].copy().str.lower()
# some important spices to add
spices_to_add = pd.Series(['black salt', 'green chillies', 'chilli powder'])
# some spices are too common (such as pepper) or not a spice, but a vegetable, or are otherwise corrupted (for example,
# cardamom is often listed as "cardamom" not specifying whether it is black or green)
spices_to_drop = [
    'black pepper', 'capers', 'chili pepper powder', 'cinnamon buds', 'citric acid', 'garlic', 'capsicum',
    'charoli', 'garcinia gummi-gutta', 'inknut', 'garcinia indica',
    'black mustard seeds/raee', 'cumin seed ground into balls', 'dried ginger', 'green chili pepper',
    'long pepper', 'four seeds', 'cubeb', 'gum tragacanth', 'jakhya', 'licorice powder',
    'indian bedellium tree', 'mango extract', 'coriander powder', 'saffron pulp', 'black cardamom',
    'brown mustard seed', 'black cumin', 'panch phoron',
]
kept_spices = spices_list.loc[~spices_list.isin(spices_to_drop)]
# Series.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same result as append(...).reset_index(drop=True).
spices_list = pd.concat([kept_spices, spices_to_add], ignore_index=True)
spices_list
One more step is editing the spices so that my string counter can find different versions of the same spice.
# The first rule is a substring rewrite (``.str.replace``); it is applied to
# every entry that contains 'amchoor'.
spices_list = spices_list.str.replace('amchoor', 'amchur/amchoor/mango extract')

# The remaining rules go through ``Series.replace`` and are applied one after
# the other, in this order. Each pair is (entry to look for, '/'-separated
# group of spellings that should all count as that spice).
spice_aliases = [
    ('asafoetida', 'asafetida/asafoetida/hing'),
    ('thymol/carom seed', 'ajwain/thymol/carom seed'),
    ('alkanet root', 'alkanet/alkanet root'),
    ('chilli powder', 'red chilli powder/chilli powder/kashmiri red chilli powder'),
    ('celery / radhuni seed', 'celery/radhuni seed'),
    ('bay leaf, indian bay leaf', 'bay leaf/bay leaves/tej patta'),
    ('curry tree or sweet neem leaf', 'curry leaf/curry leaves'),
    ('fenugreek leaf', 'fenugreek/kasoori methi'),
    ('nigella seed', 'nigella/black cumin'),
    ('ginger', 'dried ginger/ginger powder'),
    ('cloves', 'cloves/laung'),
    ('green cardamom', 'cardamom/green cardamom/black cardamom'),
    ('indian gooseberry', 'indian gooseberry/amla'),
    ('coriander seed', 'coriander seed/coriander powder'),
    ('star aniseh', 'star anise'),
    ('cumin seed', 'cumin powder/cumin seeds/cumin/jeera'),
]
for wiki_name, alias_group in spice_aliases:
    spices_list = spices_list.replace(wiki_name, alias_group)
spices_list
# Work on just the recipe name and its translated ingredient text.
ingredients_series = df[['TranslatedRecipeName', 'TranslatedIngredients']]
ingredients_series

# One all-zero column per spice entry; these become the per-recipe spice flags.
n_recipes = len(ingredients_series)
spices_list_column_to_add = {
    spice_entry: np.zeros(n_recipes) for spice_entry in spices_list.to_list()
}
zero_flags = pd.DataFrame(spices_list_column_to_add)
ingredients_series = ingredients_series.join(zero_flags)
ingredients_series
import re
def search_spice(ingredient_string, spice_string):
    '''
    Check if a spice exists in the list of ingredients for a recipe.

    The match is a case-insensitive, literal substring test.

    Parameters
    ----------
    ingredient_string : str
        Ingredient text of one recipe.
    spice_string : str
        One spice, optionally given as several '/'-separated spellings
        (e.g. 'cloves/laung'); a hit on any spelling counts.

    Returns
    -------
    bool
        True if at least one spelling occurs in the ingredient text,
        False otherwise.
    '''
    haystack = ingredient_string.lower()
    for _spice in spice_string.split('/'):
        # Tolerate spaces around the separator ('a / b').
        needle = _spice.strip().lower()
        # An empty spelling (from 'a//b' or a trailing '/') would otherwise
        # match every recipe.
        if not needle:
            continue
        # Plain substring test: spice names are literal text, so characters
        # such as '(' or '+' must not be interpreted as regex syntax.
        if needle in haystack:
            return True
    return False
# Walk through the recipes and set each spice column to 1 when that spice
# (any of its spellings) is found in the ingredient text, 0 otherwise.
for recipe_idx, recipe in ingredients_series.iterrows():
    ingredient_text = recipe['TranslatedIngredients']
    for spice_entry in spices_list:
        found = search_spice(ingredient_text, spice_entry)
        ingredients_series.loc[recipe_idx, spice_entry] = 1 if found else 0
# Recipe name plus one flag column per spice; the raw ingredient text is no longer needed.
food_spice_mix = ingredients_series.drop(columns=['TranslatedIngredients'])
food_spice_mix = food_spice_mix.reset_index(drop=True)

# Collapse the '/'-separated search groups back into one readable label per spice.
# Keys that do not match an existing column are ignored by rename().
short_spice_names = {
    'amchur/amchoor/mango extract': 'amchoor',
    'asafetida/asafoetida/hing': 'asafoetida',
    'ajwain/thymol/carom seed': 'ajwain',
    'alkanet/alkanet root': 'alkanet root',
    'red chilli powder/chilli powder/kashmiri red chilli powder': 'chilli powder',
    'celery/radhuni seed': 'celery seeds',
    'bay leaf/bay leaves/tej patta': 'bay leaf',
    'curry leaf/curry leaves': 'curry leaves',
    'fenugreek/kasoori methi': 'fenugreek leaf',
    'nigella/black cumin': 'nigella seed',
    'ginger': 'dried ginger',
    'cloves/laung': 'cloves',
    'cardamom/green cardamom/black cardamom': 'cardamom',
    'indian gooseberry/amla': 'indian gooseberry',
    'coriander seed/coriander powder': 'coriander seeds/powder',
    'cumin powder/cumin seeds/cumin/jeera': 'cumin seeds/powder',
    'dried ginger/ginger powder': 'ginger powder',
}
food_spice_mix = food_spice_mix.rename(columns=short_spice_names)
food_spice_mix.columns
# Order the columns alphabetically by label.
food_spice_mix = food_spice_mix.sort_index(axis=1)
num_spice = len(spices_list)
# Every column after the first is treated as a spice flag column.
# NOTE(review): this assumes 'TranslatedRecipeName' is the first column of
# food_spice_mix (true when the columns are sorted and all spice labels are
# lower-case) - confirm if the column order ever changes.
spice_col_name = food_spice_mix.columns[1:].to_list()

# present[r, s] is 1.0 when recipe r has spice s flagged as 1, else 0.0.
present = (food_spice_mix[spice_col_name].to_numpy() == 1).astype(float)
# (present.T @ present)[i, j] is the number of recipes in which spices i and j
# are both flagged; the diagonal therefore holds the per-spice recipe counts.
# This replaces a recipes x spices x spices Python loop of per-cell .loc writes.
pair_counts = present.T @ present

# Co-occurrence counts for every spice pair.
spice_adj_freq = pd.DataFrame(pair_counts, columns=spice_col_name, index=spice_col_name)
# Binary adjacency: 1.0 if the pair appears together in at least one recipe.
spice_adj = pd.DataFrame((pair_counts > 0).astype(float), columns=spice_col_name, index=spice_col_name)
Normalize the spice occurrence frequency by the total number of entries in the main dataset
# Turn the co-occurrence counts into a percentage of all recipes.
n_recipes_total = len(food_spice_mix)
spice_adj_freq = spice_adj_freq / n_recipes_total * 100
spice_adj_freq.round(2)

# Title-cased labels for plotting, used for both the rows and the columns.
temp_name = [label.title() for label in spice_adj_freq.index.to_list()]
spice_adj_freq = spice_adj_freq.assign(Plot_name=temp_name).set_index('Plot_name')
spice_adj_freq.columns = temp_name
spice_adj_freq
# Heatmap of the column-wise correlation between the spices' (rounded)
# co-occurrence frequencies.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
freq_corr = spice_adj_freq.round(2).corr()
sns.heatmap(freq_corr, ax=ax)
# Uncomment to write the figure to disk:
#plt.savefig("heatmap.png", format="PNG", dpi=300)
Using the frequency adjacency matrix we can plot a heatmap showing the pair-wise co-occurrence for a given pair of spices. The idea behind such an analysis is that we can check the variation of Spice 1 with all the other spices in the list and compare it to Spice 2’s variation with all the other spices in the list; if the two spices are used in similar ways, Spice 1 and Spice 2 should have similar variation.
This map itself is quite interesting. The color intensity of each tile shows how strongly the co-occurrence frequencies of that pair of spices are correlated. The brighter the color, the more similarly the two spices are used across recipes.
Some prominent spice pairs which show similarity are:
-
Curry leaves and Mustard seeds
-
Turmeric and Chilli Powder
Some pairs of spices never occur together:
-
Saffron and Fenugreek seeds
-
Nutmeg and Mustard Seeds
Those who cook or know Indian recipes would see that these pairs make sense, which validates the correlations seen in this corpus of Indian recipes.
With that analysis, we can go a step further and present this information in the form of a circular network graph. With this method of plotting, we can see the interactions between different spices.
import networkx as nx
# One node per spice; 'count' is the diagonal entry of the frequency matrix,
# i.e. the share of recipes that contain the spice.
nodes_data = [
    (spice, {'count': spice_adj_freq.loc[spice, spice]}) for spice in temp_name
]

# For each spice, the label that ranks second when its row is sorted by
# descending frequency.
binary_int = [
    (spice, spice_adj_freq.loc[spice].sort_values(ascending=False).index[1])
    for spice in temp_name
]

# Spice label -> its own frequency (same values as the node 'count' attribute).
spice_dict = {spice: attrs['count'] for spice, attrs in nodes_data}
spice_dict

# One edge entry per ordered pair of distinct spices with a non-zero
# co-occurrence frequency.
edges_data = []
for src in temp_name:
    for dst in temp_name:
        if src == dst:
            continue
        pair_freq = spice_adj_freq.loc[src, dst]
        if pair_freq != 0.0:
            edges_data.append((src, dst, {'weight': pair_freq, 'distance': 1}))
#BUILD THE INITIAL FULL GRAPH
G = nx.Graph()
G.add_nodes_from(nodes_data)
G.add_edges_from(edges_data)
# nx.info() was removed in NetworkX 3.0; printing the graph itself gives a
# node/edge summary on current releases, so fall back to that when needed.
print(nx.info(G) if hasattr(nx, 'info') else G)

# Degree (number of neighbours) of every spice node.
deg_l = {i: G.degree(i) for i in temp_name}
# Spice with the highest degree; ties resolve to the first one in temp_name order.
highest_centrality_node = max(deg_l, key=deg_l.get)
highest_centrality_node
n = len(nodes_data)

# Edge weights rescaled to the 0..1 range (min-max normalisation).
edges = G.edges()
weights = [G[u][v]['weight'] for u, v in edges]
w_arr = np.array(weights)
w_lo, w_hi = w_arr.min(), w_arr.max()
norm_weight = (w_arr - w_lo) / (w_hi - w_lo)

# Place the nodes, in alphabetical order, at equal angular steps on the unit circle.
node_list = sorted(G.nodes())
angle = []        # (x, y) coordinates, in node_list order
angle_dict = {}   # node -> angle in radians
pos = {}          # node -> (x, y), the layout handed to networkx
for slot, node in zip(np.arange(n), node_list):
    theta = 2.0 * np.pi * slot / n
    xy = (np.cos(theta), np.sin(theta))
    angle.append(xy)
    angle_dict[node] = theta
    pos[node] = xy
fig, ax = plt.subplots(figsize=(20, 20))
# Leave a wide border around the axes so the rotated labels outside the circle fit.
margin = 0.33
fig.subplots_adjust(left=margin, bottom=margin, right=1. - margin, top=1. - margin)
ax.axis('equal')

# Node size scales with the spice's own frequency, edge width with the
# normalised pair frequency.
node_sizes = [freq * 20 for freq in spice_dict.values()]
nx.draw(
    G,
    pos=pos,
    with_labels=False,
    node_size=node_sizes,
    width=norm_weight * 2.0,
    node_color=np.arange(n),
    cmap=plt.cm.viridis,
    ax=ax,
)

# Draw the labels at the node positions first, then push each one outwards
# along its node's angle and rotate it to that angle.
description = nx.draw_networkx_labels(G, pos)
r = fig.canvas.get_renderer()
trans = plt.gca().transData.inverted()
for node, label in description.items():
    theta = angle_dict[node]
    # Label extent in pixels, converted to data coordinates.
    extent_px = label.get_window_extent(renderer=r)
    extent_data = extent_px.transformed(trans)
    # Offset by half the label width so the text starts just outside the circle.
    radius = 1.1 + extent_data.width / 2
    label.set_position((radius * np.cos(theta), radius * np.sin(theta)))
    label.set_rotation(theta * 360.0 / (2.0 * np.pi))
    label.set_clip_on(False)
# Uncomment to write the figure to disk:
#plt.savefig("Graph.png", format="PNG", dpi=300)
Finally, a networkx circular graph is made where each node is a spice entry. Each edge between a pair of spices is a connection, provided those two spices are found together in a recipe. The size of a node is the frequency with which that spice occurs across all of the 6000 recipes. The thickness of the edge connecting a given spice pair is the normalized frequency with which that pair occurred among the 6000 recipes.
Representing the analysis this way, we find a few key takeaways:
-
Turmeric, Mustard Seeds, Chilli Powder, Coriander Seeds, Cumin Seeds, Curry Leaves, Green Chillies, and Asafoetida are the key spices in Indian cuisine.
-
Most recipes use Turmeric + Chilli Powder + Cumin Powder (Seeds) in them.