#include #include #include #include #include #include #include #include "Structures.h" #include "ParseTest.h" #include "util.h" #include "Algorithms.h" using namespace std; const string outputProbeFileName = "/home/gregstoll/stuff/netflixprize/download/dummyoutput.txt"; const string outputPredictionFileName = "/home/gregstoll/stuff/netflixprize/download/realoutput.txt"; vector averageMovieRatings; map averageUserRatings; vector movieCorrelations; vector movieCorrelationsBoth; map userCorrelations; map > userToEntriesMap; map > probeEntriesMap; vector curMovieEntries; class EntryByGreaterAverageRating { public: bool operator () (const Entry &e1, const Entry &e2) const { return (movieCorrelations[e1.getMovieId()] > movieCorrelations[e2.getMovieId()]); } }; const bool useOnlyTop300 = false; class GlobalAverage { public: GlobalAverage() { } void initialize(bool isProbe) { } float getExpectedRating(const Entry& e, bool isProbe) { return 3.60429; } }; class UserBasedCorrelation { public: UserBasedCorrelation() { curUserId = -1; curMovieId = -1; } void initialize(bool isProbe) { } float getExpectedRating(const Entry& e, bool isProbe) { // Use the correlation data to make an average. if (curUserId != e.getUserId()) { curUserId = e.getUserId(); userCorrelations.clear(); appendDataFromUserCorrelations(isProbe, e.getUserId(), userCorrelations); } if (curMovieId != e.getMovieId()) { curMovieId = e.getMovieId(); movieEntries.clear(); appendEntriesFromMovieFile(curMovieId, movieEntries, true); if (isProbe) { filterEntries(movieEntries, probeEntriesMap); } } double sumOfRatings = 0.0; double sumOfCorrelations = 0.0; map::const_iterator it; double averageUserRating = averageUserRatings[e.getUserId()].averageRating; set top100Entries; vector::const_iterator entryIt; // Iterate over all the similar users... for (it = userCorrelations.begin(); it != userCorrelations.end(); ++it) { int similarUserId = it->first; double curCorrelation = it->second; // See if this user has rated this movie. for(entryIt = movieEntries.begin(); entryIt != movieEntries.end(); ++entryIt) { if (entryIt->getUserId() == similarUserId) { double averageRating = averageUserRatings[similarUserId].averageRating; sumOfRatings += curCorrelation * (((double)entryIt->getRating()) - averageRating); sumOfCorrelations += fabs(curCorrelation); break; } } } float rating; if (sumOfCorrelations < 0.00001) { // Not enough data to guess...just return the average. rating = 0.0; } else { rating = sumOfRatings / sumOfCorrelations; } return (rating + averageUserRatings[e.getUserId()].averageRating); } int curUserId; int curMovieId; vector movieEntries; }; class MovieBasedCorrelation { public: MovieBasedCorrelation() { curMovieId = -1; } void initialize(bool isProbe) { vector entries; for (int i = 1; i <= numMovies; i++) { if (i % 250 == 0) { cout << "Working on movie " << i << endl; } appendEntriesFromMovieFile(i, entries, true); if (isProbe) { filterEntries(entries, probeEntriesMap); } for (vector::const_iterator pos = entries.begin(); pos != entries.end(); ++pos) { int userId = pos->getUserId(); userToEntriesMap[userId].push_back(*pos); } entries.clear(); } } float getExpectedRating(const Entry& e, bool isProbe) { // Use the correlation data to make an average. if (curMovieId != e.getMovieId()) { curMovieId = e.getMovieId(); movieCorrelations.clear(); appendDataFromMovieCorrelations(isProbe, e.getMovieId(), movieCorrelations); } double sumOfRatings = 0.0; double sumOfCorrelations = 0.0; vector::const_iterator it; double averageUserRating = averageUserRatings[e.getUserId()].averageRating; set top100Entries; for (it = userToEntriesMap[e.getUserId()].begin(); it != userToEntriesMap[e.getUserId()].end(); ++it) { Entry curEntry = *it; double curCorrelation = movieCorrelations[curEntry.getMovieId()]; if (!useOnlyTop300) { //sumOfRatings += curCorrelation * (curEntry.getRating() - averageUserRating - (averageMovieRatings[curEntry.getMovieId()].averageRating - averageMovieRatings[curMovieId].averageRating)); sumOfRatings += curCorrelation * (curEntry.getRating() - averageUserRating); sumOfCorrelations += fabs(curCorrelation); } else { // insert into our list. top100Entries.insert(curEntry); } } if (useOnlyTop300) { // Just use the first 100 entries of the set. set::const_iterator it; int i = 0; for (it = top100Entries.begin(); it != top100Entries.end() && i < 300; ++it, ++i) { Entry curEntry = *it; double curCorrelation = movieCorrelations[curEntry.getMovieId()]; sumOfRatings += curCorrelation * (curEntry.getRating() - averageUserRating - (averageMovieRatings[curEntry.getMovieId()].averageRating - averageMovieRatings[curMovieId].averageRating)); sumOfCorrelations += fabs(curCorrelation); } } float rating = sumOfRatings / sumOfCorrelations; return (rating + averageUserRating); } int curMovieId; }; static bool greaterMovieCorrelation(const MovieCorrelation& mc1, const MovieCorrelation& mc2) { return (mc1.correlation > mc2.correlation); } struct MovieEntryComparator { MovieEntryComparator(int mId) : movieId(mId) { } bool operator()(const Entry& e1) const { return e1.getMovieId() == movieId; } int movieId; }; class UserAndMovieBasedCorrelation { public: UserAndMovieBasedCorrelation() : delta(.2), lambda(.2) { curUserId = -1; curMovieId = -1; } void initialize(bool isProbe) { vector entries; for (int i = 1; i <= numMovies; i++) { if (i % 250 == 0) { cout << "Working on movie " << i << endl; } appendEntriesFromMovieFile(i, entries, true); if (isProbe) { filterEntries(entries, probeEntriesMap); } for (vector::const_iterator pos = entries.begin(); pos != entries.end(); ++pos) { int userId = pos->getUserId(); userToEntriesMap[userId].push_back(*pos); } entries.clear(); } } float getExpectedRating(const Entry& e, bool isProbe) { // Use the correlation data to make an average. if (curUserId != e.getUserId()) { curUserId = e.getUserId(); userCorrelations.clear(); appendDataFromUserCorrelations(isProbe, e.getUserId(), userCorrelations); } if (curMovieId != e.getMovieId()) { curMovieId = e.getMovieId(); movieEntries.clear(); appendEntriesFromMovieFile(curMovieId, movieEntries, true); if (isProbe) { filterEntries(movieEntries, probeEntriesMap); } movieCorrelations.clear(); appendDataFromMovieCorrelations(isProbe, e.getMovieId(), movieCorrelations); movieCorrelationsBoth.clear(); appendDataFromMovieCorrelations(isProbe, e.getMovieId(), movieCorrelationsBoth); // Sort by correlation. std::sort(movieCorrelationsBoth.begin(), movieCorrelationsBoth.end(), greaterMovieCorrelation); } double sumOfUserRatings = 0.0, sumOfMovieRatings = 0.0, sumOfUserMovieRatings = 0.0; double sumOfUserCorrelations = 0.0, sumOfMovieCorrelations = 0.0, sumOfUserMovieCorrelations = 0.0; map::const_iterator userIt; vector::const_iterator movieIt; double averageUserRating = averageUserRatings[e.getUserId()].averageRating; set top100Entries; vector::const_iterator entryIt; // First iterate over all the movies. for (movieIt = userToEntriesMap[e.getUserId()].begin(); movieIt != userToEntriesMap[e.getUserId()].end(); ++movieIt) { Entry curEntry = *movieIt; double curCorrelation = movieCorrelations[curEntry.getMovieId()]; sumOfMovieRatings += curCorrelation * (curEntry.getRating() - averageUserRating); sumOfMovieCorrelations += fabs(curCorrelation); } // Iterate over all the similar users... for (userIt = userCorrelations.begin(); userIt != userCorrelations.end(); ++userIt) { int similarUserId = userIt->first; double curCorrelation = userIt->second; // See if this user has rated this movie. for(entryIt = movieEntries.begin(); entryIt != movieEntries.end(); ++entryIt) { if (entryIt->getUserId() == similarUserId) { double averageRating = averageUserRatings[similarUserId].averageRating; sumOfUserRatings += curCorrelation * ((double)entryIt->getRating() - averageRating); sumOfUserCorrelations += fabs(curCorrelation); break; } } } // Now, iterate over all the similar users again. double minMovieCorrelation = movieCorrelationsBoth[8000].correlation; for (userIt = userCorrelations.begin(); userIt != userCorrelations.end(); ++userIt) { int altUserId = userIt->first; double altUserCorrelation = userIt->second; // Look at the top 8000 correlated movies and see which // ones this user has rated. vector altUserRatings = userToEntriesMap.find(altUserId)->second; int i = 0; vector::const_iterator altUserIt; for(altUserIt = altUserRatings.begin(); altUserIt != altUserRatings.end(); ++altUserIt) { int altMovieId = altUserIt->getMovieId(); double curMovieCorrelation = movieCorrelations[altMovieId]; // If this user has rated this movie... if (curMovieCorrelation > minMovieCorrelation) { double curAltCorrelation = 1.0 / sqrt((1.0/(altUserCorrelation*altUserCorrelation)) + (1.0/(curMovieCorrelation*curMovieCorrelation))); double averageRating = averageUserRatings[altUserId].averageRating; sumOfUserMovieRatings = curAltCorrelation * (altUserIt->getRating() - averageRating); sumOfUserMovieCorrelations += fabs(curAltCorrelation); } } } float rating = 0.0; if (sumOfUserCorrelations > 0.00001) { rating += (1.0 - delta) * lambda * (sumOfUserRatings / sumOfUserCorrelations); } if (sumOfMovieCorrelations > 0.00001) { rating += (1.0 - delta) * (1.0 - lambda) * (sumOfMovieRatings / sumOfMovieCorrelations); } if (sumOfUserMovieCorrelations > 0.00001) { rating += delta * (sumOfUserMovieRatings / sumOfUserMovieCorrelations); } return rating + averageUserRating; } int curUserId; int curMovieId; vector movieEntries; const double delta; const double lambda; vector altMovieEntries; vector altMovieCorrelations; }; int main() { string line; int movieId; bool changedId; bool isProbe = true; //UserBasedCorrelation algorithm; //MovieBasedCorrelation algorithm; UserAndMovieBasedCorrelation algorithm; vector entries; setEntriesFromMovieAverageRatings(isProbe, averageMovieRatings); appendEntriesFromUserAverageRatings(isProbe, averageUserRatings); if (isProbe) { appendEntriesFromProbeFile(probeEntriesMap); } algorithm.initialize(isProbe); ifstream file; if (isProbe) { file.open(probeFileName.c_str()); } else { file.open(predictionFileName.c_str()); } // lines to skip const int LINES_TO_SKIP = 768368; ofstream outFile; if (isProbe) { if (LINES_TO_SKIP > 0) { outFile.open(outputProbeFileName.c_str(), std::ios::app); } else { outFile.open(outputProbeFileName.c_str()); } } else { if (LINES_TO_SKIP > 0) { outFile.open(outputPredictionFileName.c_str(), std::ios::app); } else { outFile.open(outputPredictionFileName.c_str()); } } for (int i = 0; i < LINES_TO_SKIP; ++i) { getline(file, line); Entry e = parseChallengeLine(line, movieId, changedId, !isProbe); } if (LINES_TO_SKIP > 0) { cout << "Finished resuming from " << LINES_TO_SKIP << " lines" << endl; } getline(file, line); while (!file.eof()) { Entry e = parseChallengeLine(line, movieId, changedId, !isProbe); if (changedId) { outFile << movieId << ":" << endl; } else { float rating = algorithm.getExpectedRating(e, isProbe); if (rating < 1.0) { rating = 1.0; } else if (rating > 5.0) { rating = 5.0; } outFile << rating << endl; } getline(file, line); } return 0; }