# Scrape IMDb's advanced-search listing for every (year, page offset) pair and
# dump the collected columns to one CSV file per year.
#
# NOTE(review): the accumulator lists (names, year, imdb_rating, metascore,
# num_votes) and the counters (requests, start_time) are created outside this
# block and are never reset inside the year loop, so each year's CSV also
# contains every row collected for earlier years -- confirm this is intended.
for year_url in tqdm(years_url):
    for page_num in tqdm(page_iter):
        # URL to parse: {0} is the release year (used as both range bounds),
        # {1} is the result offset passed as the `start` query parameter.
        url = 'https://www.imdb.com/search/title/?title_type=feature,&release_date={0},{0}&countries=in&languages=hi&sort=num_votes,desc&start={1}&ref_=adv_prv'.format(int(year_url), int(page_num))
        response = get(url)

        # Sleep to carve out load: random pause of 1-4 s (upper bound exclusive).
        time.sleep(np.random.randint(1, 5))

        # Estimate time elapsed per request
        requests += 1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait=True)

        html_soup = BeautifulSoup(response.text, 'html.parser')
        movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')

        for container in movie_containers:
            # Resolve name and year before touching any list, so a failure
            # here cannot leave the parallel lists with different lengths.
            header = container.h3
            movie_name = header.a.text
            year_tag = header.find('span', class_='lister-item-year text-muted unbold')
            movie_year = year_tag.text.strip('()') if year_tag is not None else np.nan
            names.append(movie_name)
            year.append(movie_year)

            # Each optional field falls back to NaN when the tag is missing
            # (AttributeError on None) or its text is not numeric (ValueError);
            # exactly one value is appended per movie either way.
            try:
                movie_rating = float(container.strong.text)
            except (AttributeError, ValueError):
                movie_rating = np.nan
            imdb_rating.append(movie_rating)

            try:
                votes_text = container.find('span', attrs={'name': 'nv'}).text
                # Vote counts use thousands separators, e.g. "12,345".
                movie_votes = float(votes_text.replace(',', ''))
            except (AttributeError, ValueError):
                movie_votes = np.nan
            num_votes.append(movie_votes)

            try:
                movie_metascore = float(container.find('span', class_='metascore').text.strip())
            except (AttributeError, ValueError):
                movie_metascore = np.nan
            metascore.append(movie_metascore)

    # All pages for this year are done: write out what has been collected.
    print('Making dataframe for year {}'.format(year_url))
    df_movies = pd.DataFrame({'name': names, 'year': year, 'rating': imdb_rating, 'metascore': metascore, 'num_votes': num_votes})
    df_movies.to_csv('./temp_imdb_files/bollywood_data_{}.csv'.format(year_url), sep=',', header=True, index=False)
    # Drop the frame before the next year's scrape; the source lists persist.
    del df_movies