- Load Dataset
- Cleaning the data
- Filtering out movies
- Convert year data entry to pandas Datetime object for convenience
- Analyze annual movie releases
- Sort the movies released as per decades
import os
from requests import get
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import time as time
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import seaborn as sns
sns.set(style="whitegrid")
%config InlineBackend.figure_format = 'retina'
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# Notebook-wide matplotlib typography: larger fonts and bold axis labels
# for all figures below.
_axis_label_size, _tick_label_size = 20, 16
plot_params = {
    'font.size': 22,
    'axes.titlesize': 24,
    'axes.labelsize': _axis_label_size,
    'axes.labelweight': 'bold',
    'xtick.labelsize': _tick_label_size,
    'ytick.labelsize': _tick_label_size,
}
plt.rcParams.update(plot_params)
# --- Scrape IMDb advanced-search pages for Hindi feature films, 1950-2005 ---
# Results accumulate in these lists across ALL years, so the per-year CSV
# written at the bottom of the outer loop is a cumulative checkpoint,
# not a per-year slice.
names, year, imdb_rating, metascore, num_votes = [], [], [], [], []

# BUG FIX: clear_output was called below but never imported. It only exists
# under IPython/Jupyter; fall back to a no-op when run as a plain script.
try:
    from IPython.display import clear_output
except ImportError:
    def clear_output(wait=False):
        pass

start_time = time.time()
request_count = 0  # renamed from `requests` to avoid shadowing the library name
years_url = [str(i) for i in range(1950, 2006)]
# IMDb paginates 50 results per page via the `start` query parameter.
page_iter = [0, 51, 101, 151, 201]
for year_url in tqdm(years_url):
    for page_num in tqdm(page_iter):
        # URL to parse: feature films released in `year_url`, country=India,
        # language=Hindi, sorted by vote count descending.
        url = ('https://www.imdb.com/search/title/?title_type=feature,'
               '&release_date={0},{0}&countries=in&languages=hi'
               '&sort=num_votes,desc&start={1}&ref_=adv_prv').format(int(year_url), int(page_num))
        response = get(url)
        # Randomized sleep to throttle the request rate against the server.
        time.sleep(np.random.randint(1, 5))
        # Progress / request-rate reporting.
        request_count += 1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(request_count, request_count / elapsed_time))
        clear_output(wait=True)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
        # BUG FIX: the original enumerated the list and then re-indexed it
        # (container_entry = movie_containers[i]); iterate directly instead.
        for container_entry in movie_containers:
            movie_name = container_entry.h3.a.text
            names.append(movie_name)
            movie_year = container_entry.h3.find(
                'span', class_='lister-item-year text-muted unbold').text.strip('()')
            year.append(movie_year)
            # Rating, vote count, and metascore may be absent on sparse
            # entries; record NaN so the lists stay aligned with `names`.
            try:
                movie_rating = float(container_entry.strong.text)
                imdb_rating.append(movie_rating)
            except AttributeError:
                imdb_rating.append(np.nan)
            try:
                # Vote counts arrive with thousands separators, e.g. "1,234".
                movie_votes = float(''.join(container_entry.find('span', attrs={'name': 'nv'}).text.split(',')))
                num_votes.append(movie_votes)
            except (AttributeError, ValueError):
                num_votes.append(np.nan)
            try:
                movie_metascore = float(container_entry.find('span', class_='metascore').text.strip())
                metascore.append(movie_metascore)
            except AttributeError:
                metascore.append(np.nan)
    print('Making dataframe for year {}'.format(year_url))
    # Checkpoint: cumulative data for every year scraped so far.
    df_movies = pd.DataFrame({'name': names, 'year': year, 'rating': imdb_rating,
                              'metascore': metascore, 'num_votes': num_votes})
    df_movies.to_csv('./temp_imdb_files/bollywood_data_{}.csv'.format(year_url),
                     sep=',', header=True, index=False)
    del df_movies
# Reload the consolidated scrape results from disk and take a first look
# at the columns, dtypes, and the raw year values.
dataset_path = './IMDB-files/bollywood_movies_data_1950_2020_new.csv'
df_movies = pd.read_csv(dataset_path, sep=',', skipinitialspace=True)
df_movies.columns
df_movies.info()
df_movies['year'].unique()
Data pulled from the website has phantom characters alongside the dates. Hence this needs some cleaning on our end to ensure all the dates are in a consistent format.
df_movies.shape
I am using `str.strip` on each date entry in the dataset to remove any residual characters that match those listed in the filter. Another option is to use `str.replace` in pandas with regex filters.
# Normalize the scraped year strings. Entries arrive with residual tokens
# from the listing page -- e.g. "(1950)", "I) (1960", "1972 TV Movie".
# IMPROVED: extract the first 4-digit run with a regex instead of the
# original str.strip over a hand-built character set ('IIII) XVII) ( ...'),
# which only trimmed matching characters from the two ends and silently
# broke on any stray token not in the set.
df_movies['year'] = df_movies['year'].astype('str')
df_movies['year'] = df_movies['year'].str.extract(r'(\d{4})', expand=False)
Printing the data again to check for the date entries:
# Re-inspect the unique year values to confirm the stray characters are gone.
df_movies['year'].unique()
Consistency check for the dataframe shape to ensure no funny business
# Shape should be unchanged by the cleaning above (no rows added or dropped).
df_movies.shape
votes_filter = df_movies['num_votes'] > 50 #Filter out movies which have got 50 or fewer votes from IMDb users
df_movies_filter_votes = df_movies.loc[votes_filter].reset_index(drop=True) #Reset the indices of the new dataframe and drop the old ones -- if not done a different column with old index is appended
df_movies_filter_votes.shape
# Parse the cleaned 4-digit strings as datetimes, then keep just the integer year.
df_movies_filter_votes['year'] = pd.to_datetime(df_movies_filter_votes['year'],format='%Y').dt.year
# Aggregate per-year statistics: number of releases, the best-rated title and
# its rating, and the averages of rating and vote count.
stat_list = ['year', 'total_movies_year', 'highest_rated_movie', 'movie_rating',
             'avg_num_votes', 'avg_movie_rating']
annual_movie_stats = {key: [] for key in stat_list}
for year_entry in df_movies_filter_votes['year'].unique():
    per_year_column = df_movies_filter_votes.loc[df_movies_filter_votes['year'] == year_entry]
    try:
        # idxmax raises ValueError when every rating in the year slice is NaN.
        movie_entry_with_max_ratings = df_movies_filter_votes.loc[per_year_column['rating'].idxmax()]
        highest_movie_rating = movie_entry_with_max_ratings['rating']  # typo fix: was `higest_...`
        highest_rated_movie = movie_entry_with_max_ratings['name']
        avg_movie_rating = per_year_column['rating'].mean()
        total_movies = len(per_year_column)
        avg_num_votes = per_year_column['num_votes'].mean()
    except ValueError:
        highest_movie_rating = np.nan
        highest_rated_movie = np.nan
        total_movies = np.nan
        avg_movie_rating = np.nan
        # BUG FIX: avg_num_votes was not reset here, causing a NameError on a
        # first-iteration failure or a stale carry-over from the previous year.
        avg_num_votes = np.nan
    annual_movie_stats['year'].append(year_entry)
    annual_movie_stats['highest_rated_movie'].append(highest_rated_movie)
    annual_movie_stats['movie_rating'].append(highest_movie_rating)
    annual_movie_stats['avg_movie_rating'].append(avg_movie_rating)
    annual_movie_stats['total_movies_year'].append(total_movies)
    annual_movie_stats['avg_num_votes'].append(avg_num_votes)
# (columns= arg dropped: a dict already supplies the column names in order)
df_annual_movie_stats = pd.DataFrame(annual_movie_stats)
df_annual_movie_stats.sample(5)
# Annual trends: bar chart of release counts on top, scatter of average
# ratings below (marker size encodes the average number of votes).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(30, 20), sharex=True)
# Two-digit year labels ("'50", "'51", ...) keep the shared x-axis readable.
year_list = ["'{}".format(str(value)[2:]) for value in df_annual_movie_stats.year.to_list()]
sns.barplot(x=year_list, y='total_movies_year', color='k', alpha=0.8,
            data=df_annual_movie_stats, ax=ax1)
ax1.set_ylabel('Average movies released')
# BUG FIX: seaborn >= 0.12 removed positional x/y arguments -- pass keywords.
sns.scatterplot(x=year_list, y='avg_movie_rating', size='avg_num_votes', color='k',
                sizes=(40, 400), data=df_annual_movie_stats, ax=ax2)
ax2.set_xlabel('Year')
ax2.set_ylabel('Average movie rating')
# BUG FIX: bare get_legend() was a no-op; remove the size legend, matching
# the treatment in the decade plot further down.
ax2.get_legend().remove()
for item in ax2.get_xticklabels():
    item.set_rotation(45)
plt.tight_layout()
The two plots show the number of films released each year and the average IMDb rating of the movies released in that year. We might be tempted to conclude that movies are getting progressively worse even as more of them are released; however, it is difficult to justify that conclusion with confidence, since the number of votes cast for these movies is an important parameter to keep in mind.
# Roll the annual stats up to decades (e.g. 1953 -> 1950) and plot the same
# count + rating panels at decade granularity.
df_annual_movie_stats['decade'] = 10 * (df_annual_movie_stats['year'] // 10)
# BUG FIX: numeric_only=True -- the frame contains the string column
# 'highest_rated_movie', which pandas >= 2.0 refuses to average (TypeError).
df_annual_movie_stats_decade = df_annual_movie_stats.groupby(['decade']).mean(numeric_only=True)
df_annual_movie_stats_decade.sample(5)
df_annual_movie_stats_decade.index
# Labels like "50s", "60s", ... from the decade index.
decade_list = ["{}s".format(str(value)[2:]) for value in df_annual_movie_stats_decade.index.to_list()]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), sharex=True)
sns.barplot(x=decade_list, y='total_movies_year', data=df_annual_movie_stats_decade,
            color='k', alpha=0.8, ax=ax1)
ax1.set_ylabel('Average movies released annually')
# BUG FIX: seaborn >= 0.12 removed positional x/y arguments -- pass keywords.
sns.scatterplot(x=decade_list, y='avg_movie_rating', size='avg_num_votes',
                sizes=(100, 400), data=df_annual_movie_stats_decade, ax=ax2)
sns.despine()
ax2.set_xlabel('Decade')
ax2.set_ylabel('Average movie rating')
ax2.get_legend().remove()
plt.tight_layout()