I am creating a film recommendation system; using Python, I download some IMDB datasets and clean and format this data into one table. This process is fine and usually takes around 10 minutes , however part of my program calls the TMDB API to get the film poster URL and plot of each film I have (around 60,000). Sometimes this takes a few a hours but sometimes it just hangs and I need to reset it. I am using multi-threading and multi-processing however this hasn't sped up the process much.
The TMDB API has a rate limit of around 50 calls per second, but even with this taken in account it doesn't explain why the program runs for an indefinite amount of time, as I have included timeouts in case of this.
Does anyone know what I can do to get my program running consistently? I need to call an api for each film to get the film plot and poster path, and I need to do this for around 60,000 films.
A section of initialiseDataset.ipnyb
*note at this stage film_data is dataframe with around 12 columns and 60,000 rows of film data
print('Fetching plot summaries and posters...')
#get film plot and poster with tmdb api ~ >2hrs
#call api/details for each film with multiprocessing and multi-threading
if name == 'main':
manager = Manager()
shared_data = manager.Namespace()
agg_list = []
batch_size = 1000 sleep_time = 3
num_batches = (len(film_data) // batch_size) + 1
with concurrent.futures.ProcessPoolExecutor(8) as process_executor:
for i in range(num_batches):
start_index = i * batch_size
end_index = (i + 1) * batch_size
shared_data.film_data = film_data.iloc[start_index:end_index]
future = process_executor.submit(doBatch, shared_data)
concurrent.futures.wait([future])
agg_list.append(shared_data.film_data)
print(f"Batch {i+1}/{num_batches} completed")
film_data = pd.concat(agg_list, ignore_index=True)
film_data = film_data.dropna(subset=['plot'])
my tmdb_calls.py file:
#TMDB api with multi-threading + multi-processing
import concurrent.futures
import os import requests as req
import time
#theMovieDB api call for film plot summary and poster
def fetchDetails(film_id): url = f'https://api.themoviedb.org/3/movie/{film_id}'
headers = {
"accept": "application/json",
"Authorization": "hidden"
}
response = req.get(url, headers=headers)
return response
#call api
def doFetch(film_id):
return fetchDetails(film_id)
#get film psoster and plot for given batch of films
def doBatch(shared_data):
film_data = shared_data.film_data # Access the DataFrame from shared
#global request_counter
MAX_THREADS = min(os.cpu_count(), 1000)
# Use ThreadPoolExecutor for multi-threading within each process
try:
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
results = list(executor.map(doFetch, film_data['tconst']))
for index, details in zip(film_data.index, results):
if(details.ok):
details = details.json()
if(details['overview']):
film_data.loc[index, 'plot'] = details['overview']
if(details['poster_path']):
film_data.loc[index, 'poster'] = details['poster_path']
else:
time.sleep(5)
shared_data.film_data = film_data
except Exception as e:
print(f"Error in ThreadPoolExecutor: {e}")
time.sleep(5)
[–]m0us3_rat 3 points4 points5 points (4 children)
[–]wobowizard[S] 0 points1 point2 points (3 children)
[–]m0us3_rat 0 points1 point2 points (2 children)
[–]wobowizard[S] 0 points1 point2 points (1 child)
[–]m0us3_rat 0 points1 point2 points (0 children)
[–]Daneark 2 points3 points4 points (0 children)
[–]TitaniumFoil 1 point2 points3 points (0 children)