# import statements for all of the packages used
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
# Upgrade pandas to use dataframe.explode() function.
!pip install --upgrade pandas==0.25.0;
# Read the TMDb movie data from the CSV file into a dataframe
df = pd.read_csv('tmdb-movies.csv')

# First look at the data: sample rows, dimensions, summary statistics, column info
df.head()
df.shape
df.describe()
df.info()

# Count the fully duplicated rows, then drop them in place
df.duplicated().sum()
df.drop_duplicates(inplace=True)

# Parse release_date as datetime so that the release month can be pulled out
df['release_date'] = pd.to_datetime(df['release_date'])
df['month'] = df['release_date'].dt.month

# Columns that are specific to individual movies and are not used in the analysis
unused_columns = [
    'id', 'imdb_id', 'original_title', 'homepage', 'tagline', 'keywords',
    'overview', 'runtime', 'production_companies', 'cast', 'director',
    'release_date',
]
df.drop(columns=unused_columns, inplace=True)

# Inspect what remains after cleaning
df.head()
df.info()
# Side-by-side histograms: popularity on the left, adjusted revenue on the right
fig, axes = plt.subplots(nrows=1, ncols=2)
fig.tight_layout()
ax_popularity, ax_revenue = axes
df['popularity'].hist(ax=ax_popularity, label='popularity');
df['revenue_adj'].hist(ax=ax_revenue, label='revenue');
ax_popularity.set(title='Popularity histogram', xlabel='Popularity value');
ax_revenue.set(title='Revenue (adjusted) histogram', xlabel='Revenue (adjusted)');
# Single y-axis label shared by both panels, placed to the left of the figure
fig.text(-0.05, 0.5, 'Number of movies', va='center', rotation='vertical');
# Keep only the movies whose adjusted revenue is greater than zero
has_revenue = df['revenue_adj'] > 0
df2 = df[has_revenue]
# Scatter plot of popularity against adjusted revenue
revenue_scatter_ax = df2.plot.scatter(x='revenue_adj', y='popularity')
revenue_scatter_ax.set_xlabel('Revenue (adjusted)');
revenue_scatter_ax.set_ylabel('Popularity');
revenue_scatter_ax.set_title('Scatter plot of Revenue (adjusted) vs Popularity');
# Side-by-side histograms: popularity on the left, vote average on the right
fig, axes = plt.subplots(nrows=1, ncols=2)
fig.tight_layout()
ax_popularity, ax_votes = axes
df['popularity'].hist(ax=ax_popularity, label='popularity');
df['vote_average'].hist(ax=ax_votes, label='vote_average');
ax_popularity.set(title='Popularity histogram', xlabel='Popularity value');
ax_votes.set(title='Vote average histogram', xlabel='Vote average');
# Single y-axis label shared by both panels, placed to the left of the figure
fig.text(-0.05, 0.5, 'Number of movies', va='center', rotation='vertical');
# Scatter plot of popularity against vote average for every movie
votes_scatter_ax = df.plot.scatter(x='vote_average', y='popularity')
votes_scatter_ax.set_xlabel('Vote average');
votes_scatter_ax.set_ylabel('Popularity');
votes_scatter_ax.set_title('Scatter plot of Vote average vs Popularity');

# Same plot again after removing the 3 outliers with very high popularity
below_outlier_cutoff = df['popularity'] < 15
df4 = df[below_outlier_cutoff]
trimmed_scatter_ax = df4.plot.scatter(x='vote_average', y='popularity')
trimmed_scatter_ax.set_xlabel('Vote average');
trimmed_scatter_ax.set_ylabel('Popularity');
trimmed_scatter_ax.set_title('Altered scatter plot of Vote average vs Popularity');
# df2 already excludes zero-revenue rows; additionally exclude zero-budget rows
has_budget = df2['budget_adj'] > 0
df3 = df2[has_budget]
# Scatter plot of adjusted revenue against adjusted budget
budget_scatter_ax = df3.plot.scatter(x='budget_adj', y='revenue_adj')
budget_scatter_ax.set_xlabel('Budget (adjusted)');
budget_scatter_ax.set_ylabel('Revenue (adjusted)');
# Figure-level title, positioned at the top edge of the figure
budget_scatter_ax.figure.suptitle('Scatter plot of Revenue (adjusted) vs Budget (adjusted)', y=1);
# Group by month and get the average adjusted revenue.
# Each chart is drawn on a figure created explicitly here and passed via
# ``ax``, so the result does not depend on which axes happens to be current
# and the two bar charts can never end up drawn on top of each other.
fig, ax = plt.subplots()
df2.groupby('month')['revenue_adj'].mean().plot(kind='bar', ax=ax);
ax.set_xlabel('Month');
ax.set_ylabel('Mean revenue (adjusted)');
ax.set_title('Bar plot of mean revenue (adjusted) per month');

# How many movies are released each month
fig, ax = plt.subplots()
df2.groupby('month')['month'].count().plot(kind='bar', ax=ax);
ax.set_xlabel('Month');
ax.set_ylabel('Movie count');
ax.set_title('Bar plot of movie count per month');
# Average revenue (adjusted) generated per year.
# Every chart below gets its own explicitly created figure, passed via ``ax``,
# so the three bar charts are independent of the current-axes state and of
# each other.
fig, ax = plt.subplots(figsize=(14, 8))
df2.groupby('release_year')['revenue_adj'].mean().plot(kind='bar', ax=ax);
ax.set_xlabel('Release year', size=20);
ax.set_ylabel('Mean revenue (adjusted)', size=20);
ax.set_title('Bar plot of mean revenue (adjusted) per year', size=20);

# Overall revenue generated per year
fig, ax = plt.subplots(figsize=(14, 8))
df2.groupby('release_year')['revenue_adj'].sum().plot(kind='bar', ax=ax);
ax.set_xlabel('Release year', size=20);
ax.set_ylabel('Overall revenue (adjusted)', size=20);
ax.set_title('Bar plot of overall revenue (adjusted) per year', size=20);

# How many movies were released each year?
fig, ax = plt.subplots(figsize=(14, 8))
df2.groupby('release_year')['release_year'].count().plot(kind='bar', ax=ax);
ax.set_xlabel('Release year', size=20);
ax.set_ylabel('Movie Count', size=20);
ax.set_title('Bar plot of movie count per year', size=20);
# A movie can list several genres in one cell, separated by "|". For the
# per-genre analysis every movie must contribute one row per genre.
#
# Rows without a genre are dropped first. The genres cell is then split into
# a list and DataFrame.explode() (available from pandas 0.25) emits one row
# per list element, repeating the other columns and keeping the original row
# label. This works for any number of genres per movie, and the filtering and
# the splitting operate on the same dataframe, so no boolean mask is ever
# applied to a dataframe with a different index.
genre_rows = df2.dropna(subset=['genres']).copy()
genre_rows['genres'] = genre_rows['genres'].str.split('|')

# new_df holds exactly one genre per row. Rows belonging to the same movie are
# adjacent; the row order is irrelevant for the groupby-based plots that follow.
new_df = genre_rows.explode('genres')
# Show which genres produce a higher average revenue (adjusted).
# Each chart is drawn on a figure created explicitly here and passed via
# ``ax``, so the two bar charts do not depend on the current-axes state and
# cannot end up drawn on top of each other.
fig, ax = plt.subplots(figsize=(12, 8))
new_df.groupby('genres')['revenue_adj'].mean().plot(kind='bar', ax=ax);
ax.set_xlabel('Genre', size=20);
ax.set_ylabel('Mean revenue (adjusted)', size=20);
ax.set_title('Bar plot of mean revenue (adjusted) per genre', size=20);

# Which genres were more popular?
fig, ax = plt.subplots(figsize=(12, 8))
new_df.groupby('genres')['popularity'].mean().plot(kind='bar', ax=ax);
ax.set_xlabel('Genre', size=20);
ax.set_ylabel('Popularity', size=20);
ax.set_title('Bar plot of popularity per genre', size=20);
Adventure and science fiction were the most popular genres, followed by fantasy, animation and action. Documentary, foreign and TV movies were the least popular genres.
Comparing this plot with the previous one shows that a genre's popularity and the revenue associated with that genre are correlated.
# Which genre had more movies released?
# The figure is created explicitly and passed via ``ax`` so this chart gets
# its own axes regardless of which axes is current.
fig, ax = plt.subplots(figsize=(12, 8))
new_df.groupby('genres')['genres'].count().plot(kind='bar', ax=ax);
ax.set_xlabel('Genre', size=20);
ax.set_ylabel('Movie count', size=20);
ax.set_title('Bar plot of movie count per genre', size=20);
In this project we analyzed a data set containing information about 1000 movies collected from The Movie Database (TMDb). We were interested in finding correlations and trends between movie revenues, popularity and different properties associated with them.
The data was first cleaned by removing duplicate rows and unnecessary columns for the analysis. The date column was also converted to datetime and the month extracted to a separate column.
The analysis showed that there was no correlation between a movie's popularity or budget and the revenue it generated. The analysis also showed that popular movies tend to have a rating higher than 5, but that a high rating did not necessarily mean that a movie was popular.
Movies released in June generate the most revenue on average, whilst movies released in September generate the least. There is not much variation in average movie revenues (adjusted) throughout the years after the year 1960. More movies are released and more overall revenue is generated as the years progress.
Animation and action movies generated more revenue per movie, whereas documentary and foreign movies generated the lowest revenue per movie. Adventure and science fiction were the most popular genres, whereas documentary, foreign and TV movies were the least popular genres. There is a correlation between popular genres and the revenue they generate.
There was no correlation found between popularity and revenue. A deeper understanding of the value 'popularity' and where it came from would be beneficial.
There were many missing budget and revenue values, which are provided as zeros. I had to remove this data, as any assumptions about it would only distort the results and analysis. Having this data would improve the analysis.
Several cells also had missing values, including directors, production companies and genres. I removed the rows with missing genres as there were only a few. However, several director and production company fields are missing, and this can affect future analysis of their correlation with movie revenue and popularity.
Further analysis can be performed to find out whether certain directors or cast members affect a movie's popularity and revenue. Further investigation of production companies can also be performed with regard to popularity and revenues. The popularity and revenues of certain genres throughout the years can also be analysed.
# Run nbconvert on this notebook in a child Python process
from subprocess import call

nbconvert_command = ['python', '-m', 'nbconvert', 'Investigate_a_Dataset.ipynb']
call(nbconvert_command)