for year_url in tqdm(years_url):
    for page_num in tqdm(page_iter):
        # URL to parse for this year and results page
        url = 'https://www.imdb.com/search/title/?title_type=feature,&release_date={0},{0}&countries=in&languages=hi&sort=num_votes,desc&start={1}&ref_=adv_prv'.format(int(year_url), int(page_num))
        response = get(url)
        # Pause between requests to reduce load on the server
        time.sleep(np.random.randint(1, 5))
        # Track the number of requests made and the elapsed time to monitor the request rate
        requests += 1
        elapsed_time = time.time() - start_time
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests / elapsed_time))
        clear_output(wait=True)
        # Parse the page and collect every movie container on it
        html_soup = BeautifulSoup(response.text, 'html.parser')
        movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
        for container_entry in movie_containers:
            # Movie name and release year are always present
            movie_name = container_entry.h3.a.text
            names.append(movie_name)
            movie_year = container_entry.h3.find('span', class_='lister-item-year text-muted unbold').text.strip('()')
            year.append(movie_year)
            #print(movie_name, movie_year)
            # Rating, vote count and Metascore can be missing, so fall back to NaN
            try:
                movie_rating = float(container_entry.strong.text)
                imdb_rating.append(movie_rating)
            except AttributeError:
                imdb_rating.append(np.nan)
            try:
                movie_votes = float(''.join(container_entry.find('span', attrs={'name': 'nv'}).text.split(',')))
                num_votes.append(movie_votes)
            except (AttributeError, ValueError):
                num_votes.append(np.nan)
            try:
                movie_metascore = float(container_entry.find('span', class_='metascore').text.strip())
                metascore.append(movie_metascore)
            except AttributeError:
                metascore.append(np.nan)
    print('Making dataframe for year {}'.format(year_url))
    df_movies = pd.DataFrame({'name': names, 'year': year, 'rating': imdb_rating, 'metascore': metascore, 'num_votes': num_votes})
    df_movies.to_csv('./temp_imdb_files/bollywood_data_{}.csv'.format(year_url), sep=',', header=True, index=False)
    del df_movies
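For reference, here is a minimal sketch of the setup the loop above relies on (imports, the request counter, and the accumulator lists). The exact values of years_url and page_iter are defined earlier in the original notebook, so the ranges shown here are only illustrative placeholders.

# Assumed setup for the scraping loop above; the originals are defined
# earlier in the notebook, and years_url / page_iter are placeholders here.
import time
import numpy as np
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
from tqdm import tqdm
from IPython.display import clear_output

years_url = range(2010, 2020)    # placeholder: years to scrape
page_iter = range(1, 201, 50)    # placeholder: IMDb 'start' offsets (50 results per page)

# Accumulator lists for the scraped fields
names, year, imdb_rating, metascore, num_votes = [], [], [], [], []

# Request counter and timer used to report the request rate
requests = 0
start_time = time.time()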