Difference between revisions of "TensorFlow: Movie Rating Prediction"

From OnnoWiki
Jump to navigation Jump to search
(Created page with "Download https://github.com/fivethirtyeight/data/blob/master/fandango/fandango_score_comparison.csv https://github.com/fivethirtyeight/data/blob/master/fandango/fandango_sc...")
 
 
(2 intermediate revisions by the same user not shown)
Line 4: Line 4:
 
  https://github.com/fivethirtyeight/data/blob/master/fandango/fandango_scrape.csv
 
  https://github.com/fivethirtyeight/data/blob/master/fandango/fandango_scrape.csv
  
 +
Source
 +
 +
import numpy as np
 +
import pandas as pd
 +
from scipy import stats
 +
import sklearn
 +
from sklearn.model_selection import train_test_split
 +
import tensorflow as tf
 +
import matplotlib
 +
import matplotlib.pyplot as plt
 +
import seaborn as sns
 +
import math
 +
 +
df = pd.read_csv('fandango_score_comparison.csv')
 +
print(df.head())
 +
df.rename(columns={'Metacritic_user_nom':'Metacritic_user_norm'},inplace=True)
 +
rankings_lst = ['Fandango_Stars',
 +
                'RT_user_norm',
 +
                'RT_norm',
 +
                'IMDB_norm',
 +
                'Metacritic_user_norm',
 +
                'Metacritic_norm']
 +
def my_heatmap(df):
 +
    import seaborn as sns
 +
    fig, axes = plt.subplots()
 +
    sns.heatmap(df, annot=True)
 +
    plt.show()
 +
    plt.close()
 +
my_heatmap(df[rankings_lst].corr(method='pearson'))
 +
RT_lst = df['RT_norm'] >= 4.
 +
my_heatmap(df[RT_lst][rankings_lst].corr(method='pearson'))
 +
 +
 +
 +
feature_cols = ['Fandango_Stars', 'RT_user_norm', 'RT_norm', 'Metacritic_user_norm', 'Metacritic_norm']
 +
X = df.loc[:, feature_cols]
 +
y = df['IMDB_norm']
 +
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=43)
 +
 +
dim = len(feature_cols)
 +
dim += 1
 +
X_train = X_train.assign( independent = pd.Series([1] * len(y_train), index=X_train.index))
 +
X_test = X_test.assign( independent = pd.Series([1] * len(y_train), index=X_test.index))
 +
 +
P_train = X_train.as_matrix(columns=None)
 +
P_test = X_test.as_matrix(columns=None)
 +
q_train = np.array(y_train.values).reshape(-1,1)
 +
q_test = np.array(y_test.values).reshape(-1,1)
 +
 +
P = tf.placeholder(tf.float32,[None,dim])
 +
q = tf.placeholder(tf.float32,[None,1])
 +
T = tf.Variable(tf.ones([dim,1]))
 +
bias = tf.Variable(tf.constant(1.0, shape = [dim]))
 +
q_ = tf.add(tf.matmul(P, T),bias)
 +
 +
 +
cost = tf.reduce_mean(tf.square(q_ - q))
 +
learning_rate = 0.0001
 +
training_op = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)
 +
 +
init_op = tf.global_variables_initializer()
 +
cost_history = np.empty(shape=[1],dtype=float)
 +
 +
training_epochs = 50000
 +
with tf.Session() as sess:
 +
    sess.run(init_op)
 +
    cost_history = np.empty(shape=[1], dtype=float)
 +
    t_history = np.empty(shape=[dim, 1], dtype=float)
 +
    for epoch in range(training_epochs):
 +
        sess.run(training_op, feed_dict={P: P_train, q: q_train})
 +
        cost_history = np.append(cost_history, sess.run(cost,feed_dict={P: P_train, q: q_train}))
 +
        t_history = np.append(t_history, sess.run(T, feed_dict={P: P_train, q: q_train}), axis=1)
 +
    q_pred = sess.run(q_, feed_dict={P: P_test})[:, 0]
 +
    mse = tf.reduce_mean(tf.square(q_pred - q_test))
 +
    mse_temp = mse.eval()
 +
    sess.close()
 +
 +
print(mse_temp)
 +
RMSE = math.sqrt(mse_temp)
 +
print(RMSE)
 +
 +
fig, axes = plt.subplots()
 +
plt.plot(range(len(cost_history)), cost_history)
 +
axes.set_xlim(xmin=0.95)
 +
axes.set_ylim(ymin=1.e-2)
 +
axes.set_xscale("log", nonposx='clip')
 +
axes.set_yscale("log", nonposy='clip')
 +
axes.set_ylabel('Training cost')
 +
axes.set_xlabel('Iterations')
 +
axes.set_title('Learning rate = ' + str(learning_rate))
 +
plt.show()
 +
plt.close()
 +
 +
 +
predictedDF = X_test.copy(deep=True)
 +
predictedDF.insert(loc=0, column='IMDB_norm_predicted', value=pd.
 +
Series(data=q_pred, index=predictedDF.index))
 +
predictedDF.insert(loc=0, column='IMDB_norm_actual', value=q_test)
 +
print('Predicted vs actual rating using LR with TensorFlow')
 +
print(predictedDF[['IMDB_norm_actual', 'IMDB_norm_predicted']].head())
 +
print(predictedDF[['IMDB_norm_actual', 'IMDB_norm_predicted']].tail())
 +
 +
 +
plt.scatter(q_test, q_pred, color='blue', alpha=0.5)
 +
plt.plot([q_test.min(), q_test.max()], [q_test.min(), q_test.max()], '--', lw=1)
 +
plt.title('Predicted vs Actual')
 +
plt.xlabel('Actual')
 +
plt.ylabel('Predicted')
 +
plt.show()
 +
plt.show()
  
  

Latest revision as of 10:35, 30 July 2019

Download

https://github.com/fivethirtyeight/data/blob/master/fandango/fandango_score_comparison.csv
https://github.com/fivethirtyeight/data/blob/master/fandango/fandango_scrape.csv

Source

import numpy as np
import pandas as pd
from scipy import stats
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Load the score comparison table from the working directory and echo the
# first rows as a quick sanity check.
df = pd.read_csv('fandango_score_comparison.csv')
print(df.head())

# Give the 'Metacritic_user_nom' column the same '_norm' suffix as the other
# normalised-score columns used below.
df.rename(
    columns={'Metacritic_user_nom': 'Metacritic_user_norm'},
    inplace=True,
)

# Rating columns that are compared against each other in the heatmaps.
rankings_lst = [
    'Fandango_Stars',
    'RT_user_norm',
    'RT_norm',
    'IMDB_norm',
    'Metacritic_user_norm',
    'Metacritic_norm',
]
def my_heatmap(df):
    """Show *df* as an annotated seaborn heatmap in a new figure.

    The figure is displayed with ``plt.show()`` and closed afterwards, so
    nothing is returned.

    Args:
        df: Rectangular data accepted by ``seaborn.heatmap``; the callers in
            this script pass a correlation matrix.
    """
    import seaborn as sns

    figure, heat_axes = plt.subplots()
    # Draw on the axes that were just created (they are also the current axes).
    sns.heatmap(df, annot=True, ax=heat_axes)
    plt.show()
    plt.close(figure)
# Pearson correlation between all rating columns, over every movie.
all_movies_corr = df[rankings_lst].corr(method='pearson')
my_heatmap(all_movies_corr)

# Same correlation, restricted to movies whose RT_norm score is at least 4.
RT_lst = df['RT_norm'] >= 4.0
high_rt_corr = df.loc[RT_lst, rankings_lst].corr(method='pearson')
my_heatmap(high_rt_corr)


# Predict IMDB_norm from the remaining rating columns.
feature_cols = ['Fandango_Stars', 'RT_user_norm', 'RT_norm', 'Metacritic_user_norm', 'Metacritic_norm']
X = df.loc[:, feature_cols]
y = df['IMDB_norm']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=43)

# One extra input dimension for the constant "independent" column.
dim = len(feature_cols) + 1

# Assigning a scalar broadcasts it over each frame's own index, so the column
# always has the right length regardless of how the split is sized.
X_train = X_train.assign(independent=1)
X_test = X_test.assign(independent=1)

# Plain NumPy views of the design matrices; DataFrame.as_matrix() no longer
# exists in pandas >= 1.0, while .values works on old and new releases.
P_train = X_train.values
P_test = X_test.values

# Targets as (N, 1) column vectors to match the placeholder `q`.
q_train = np.array(y_train.values).reshape(-1, 1)
q_test = np.array(y_test.values).reshape(-1, 1)

# TensorFlow 1.x graph: q_ = P @ T + bias.
P = tf.placeholder(tf.float32, [None, dim])
q = tf.placeholder(tf.float32, [None, 1])
T = tf.Variable(tf.ones([dim, 1]))
# Shape [1] keeps the prediction at (N, 1); a [dim]-shaped bias would
# broadcast the output to (N, dim) and average the cost over duplicate columns.
bias = tf.Variable(tf.constant(1.0, shape=[1]))
q_ = tf.add(tf.matmul(P, T), bias)


# Mean squared error between the model output and the fed targets.
cost = tf.reduce_mean(tf.square(q_ - q))
learning_rate = 0.0001
training_op = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)
init_op = tf.global_variables_initializer()
training_epochs = 50000

train_feed = {P: P_train, q: q_train}
with tf.Session() as sess:
    sess.run(init_op)

    # Entry 0 of each history is the state before any update, so entry k is
    # the state after k gradient steps (lengths stay training_epochs + 1).
    start_cost, start_T = sess.run([cost, T], feed_dict=train_feed)
    cost_values = [start_cost]
    t_values = [start_T]

    for epoch in range(training_epochs):
        sess.run(training_op, feed_dict=train_feed)
        # Evaluated in a separate run so both values reflect the updated weights.
        epoch_cost, epoch_T = sess.run([cost, T], feed_dict=train_feed)
        cost_values.append(epoch_cost)
        t_values.append(epoch_T)

    q_pred = sess.run(q_, feed_dict={P: P_test})[:, 0]
    # The `with` block closes the session; no explicit sess.close() needed.

# Collect the histories once instead of re-allocating with np.append per epoch.
cost_history = np.array(cost_values, dtype=float)
t_history = np.concatenate(t_values, axis=1).astype(float)  # shape (dim, epochs + 1)

# Flatten the targets first: q_pred is (N,) while q_test is (N, 1), and
# subtracting those directly would broadcast to an (N, N) matrix.
mse = np.mean(np.square(q_pred - np.ravel(q_test)))
mse_temp = float(mse)
print(mse_temp)
RMSE = math.sqrt(mse_temp)
print(RMSE)
# Log-log plot of the training cost against the iteration index.
fig, axes = plt.subplots()
axes.plot(range(len(cost_history)), cost_history)
# `left`/`bottom` are the current names of the lower-limit arguments; the
# `xmin`/`ymin` aliases are rejected by recent matplotlib releases.
axes.set_xlim(left=0.95)  # starts just below iteration 1; index 0 cannot be drawn on a log axis
axes.set_ylim(bottom=1.e-2)
# Non-positive values are clipped by default on log axes, so the removed
# `nonposx`/`nonposy` keywords are simply dropped.
axes.set_xscale("log")
axes.set_yscale("log")
axes.set_ylabel('Training cost')
axes.set_xlabel('Iterations')
axes.set_title('Learning rate = ' + str(learning_rate))
plt.show()
plt.close()


# Side-by-side table of actual and predicted IMDB_norm for the test rows.
predictedDF = X_test.copy(deep=True)
predictedDF.insert(
    loc=0,
    column='IMDB_norm_predicted',
    value=pd.Series(data=q_pred, index=predictedDF.index),
)
# q_test is an (N, 1) column vector; flatten it so a 1-D column is inserted.
predictedDF.insert(loc=0, column='IMDB_norm_actual', value=np.ravel(q_test))

print('Predicted vs actual rating using LR with TensorFlow')
# A list of column names inside the indexer selects a two-column sub-frame.
comparison_cols = ['IMDB_norm_actual', 'IMDB_norm_predicted']
print(predictedDF[comparison_cols].head())
print(predictedDF[comparison_cols].tail())


# Scatter of predicted against actual ratings; the dashed diagonal marks
# where a perfect prediction would fall.
plt.scatter(q_test, q_pred, color='blue', alpha=0.5)
actual_range = [q_test.min(), q_test.max()]
plt.plot(actual_range, actual_range, '--', lw=1)
plt.title('Predicted vs Actual')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
# Release the figure after display, as the other plots in this script do
# (the second plt.show() had nothing left to draw).
plt.close()



Pranala Menarik