#include #include #include #include #include "Structures.h" #include "util.h" using namespace std; std::ostream& operator<< (std::ostream& os, const Entry& entry) { os << "userId: " << entry.getUserId() << ", rating: " << (int) entry.getRating() << ", movieId: " << entry.getMovieId(); return os; } void addUserAverageRatingLine(string line, std::map& ratingMap) { // First find the userId, then parse the rest of the line. string::size_type firstComma = line.find(","); if (firstComma != string::npos) { int userId = from_string(line.substr(0, firstComma)); AverageRating rating = parseMovieAverageRatingLine(line.substr(firstComma+1)); ratingMap.insert(std::make_pair(userId, rating)); return; } else { cout << "addUserAverageRatingLine: Didn't find first comma in line: " << line << endl; return; } } void addUserCorrelationLine(string line, std::map& correlations) { // First find the userId, then parse the rest of the line. string::size_type firstComma = line.find(","); if (firstComma != string::npos) { int userId = from_string(line.substr(0, firstComma)); double correlation = from_string(line.substr(firstComma+1)); correlations.insert(std::make_pair(userId, correlation)); return; } else { cout << "addUserCorrelationLine: Didn't find first comma in line: " << line << endl; return; } } AverageRating parseMovieAverageRatingLine(string line) { string::size_type firstComma = line.find(","); if (firstComma != string::npos) { double averageRating; averageRating = from_string(line.substr(0, firstComma)); string::size_type secondComma = line.find(",", firstComma + 1); if (secondComma != string::npos) { int numRatings = from_string(line.substr(firstComma + 1, secondComma)); double stdDev = from_string(line.substr(secondComma + 1)); return AverageRating(averageRating, numRatings, stdDev); } else { cout << "parseMovieAverageRatingLine: Didn't find second comma in line: " << line << endl; return AverageRating(0.0, 0, 0.0); } } else { cout << "parseMovieAverageRatingLine: Didn't find first comma in line: " << line << endl; return AverageRating(0.0, 0, 0.0); } } void setEntriesFromMovieAverageRatings(bool isProbe, std::vector& entries) { entries.clear(); // Insert something in the 0th slot. entries.push_back(AverageRating()); string fileName; if (isProbe) { fileName = probeDataDir; } else { fileName = predictionDataDir; } fileName += "movieAverageRatings.txt"; ifstream file(fileName.c_str()); if (!file) { cout << "ERROR: setEntriesFromMovieAverageRatings: couldn't open " << fileName.c_str() << endl; } string line; getline(file, line); while (!file.eof()) { entries.push_back(parseMovieAverageRatingLine(line)); getline(file, line); } } void appendEntriesFromUserAverageRatings(bool isProbe, std::map& entries) { string fileName; if (isProbe) { fileName = probeDataDir; } else { fileName = predictionDataDir; } fileName += "userAverageRatings.txt"; ifstream file(fileName.c_str()); if (!file) { cout << "ERROR: appendEntriesFromUserAverageRatings: couldn't open " << fileName.c_str() << endl; } string line; getline(file, line); while (!file.eof()) { addUserAverageRatingLine(line, entries); getline(file, line); } } void appendDataFromMovieCorrelations(bool isProbe, int movieId, std::vector& correlations) { string fileToOpen = "movieCorrelations/" + formatNumber(movieId/1000, 2) + "/" + formatNumber(movieId, 7) + ".txt"; if (isProbe) { fileToOpen = "probe/" + fileToOpen; } correlations.clear(); correlations.reserve(numMovies + 1); // Insert something in the 0th slot. correlations.push_back(0.00); ifstream file(fileToOpen.c_str()); if (!file) { cout << "ERROR: appendDataFromMovieCorrelations: couldn't open " << fileToOpen.c_str() << endl; } string line; getline(file, line); while (!file.eof()) { correlations.push_back(from_string(line)); getline(file, line); } file.close(); if (correlations.size() != numMovies + 1) { cout << "Got wrong number of entries from correlation file " << fileToOpen << "! (expected " << numMovies + 1 << ", got " << correlations.size() << ")" << endl; } } void appendDataFromMovieCorrelations(bool isProbe, int movieId, std::vector& correlations) { string fileToOpen = "movieCorrelations/" + formatNumber(movieId/1000, 2) + "/" + formatNumber(movieId, 7) + ".txt"; if (isProbe) { fileToOpen = "probe/" + fileToOpen; } correlations.clear(); correlations.reserve(numMovies); ifstream file(fileToOpen.c_str()); if (!file) { cout << "ERROR: appendDataFromMovieCorrelations: couldn't open " << fileToOpen.c_str() << endl; } string line; getline(file, line); int mId = 1; while (!file.eof()) { correlations.push_back(MovieCorrelation(mId, from_string(line))); ++mId; getline(file, line); } file.close(); if (correlations.size() != numMovies) { cout << "Got wrong number of entries from correlation file " << fileToOpen << "! (expected " << numMovies << ", got " << correlations.size() << ")" << endl; } } void appendDataFromUserCorrelations(bool isProbe, int userId, std::map& correlations) { string fileToOpen = "userCorrelations/" + formatNumber(userId%100, 2) + "/" + formatNumber(userId, 7) + ".txt"; if (isProbe) { fileToOpen = "probe/" + fileToOpen; } correlations.clear(); ifstream file(fileToOpen.c_str()); string line; if (!file) { cout << "ERROR: appendDataFromUserCorrelations: couldn't open " << fileToOpen.c_str() << endl; exit(1); } getline(file, line); while (!file.eof()) { addUserCorrelationLine(line, correlations); getline(file, line); } file.close(); } void appendEntriesFromProbeFile(std::map >& entries) { ifstream file(probeFileName.c_str()); if (!file) { cout << "ERROR: appendEntriesFromProbeFile: couldn't open " << probeFileName.c_str() << endl; } string line; int movieId = -1; bool changedId; getline(file, line); while (!file.eof()) { Entry e = parseChallengeLine(line, movieId, changedId, false); if (!changedId) { entries[movieId].push_back(e); } getline(file, line); } } Entry parseChallengeLine(string line, int& movieId, bool& changedId, bool hasDates) { if (line[line.size() - 1] == ':') { // This is a movie line. string restOfLine = line.substr(0, line.size() - 1); movieId = from_string(restOfLine); changedId = true; return Entry(0,0,0); } changedId = false; if (hasDates) { string::size_type firstComma = line.find(","); if (firstComma != string::npos) { int userId; userId = from_string(line.substr(0, firstComma)); return Entry(userId, 0, movieId); } else { cout << "Didn't find first comma in line: " << line << endl; return Entry(0,0,0); } } else { int userId = from_string(line); if (userId == 2466194 && movieId == 9939) { cout << "Got bad entry from line: " << line; } return Entry(from_string(line), 0, movieId); } } void filterEntries(vector& entries, map >& probeEntriesMap) { vector::const_iterator probeIt; vector::iterator it; int movieId = entries[0].getMovieId(); vector& probeEntries = probeEntriesMap[movieId]; for (probeIt = probeEntries.begin(); probeIt != probeEntries.end(); ++probeIt) { it = std::find_if(entries.begin(), entries.end(), ResultComparator(*probeIt)); if (it != entries.end()) { entries.erase(it); } else { cout << "Couldn't find entry for movieId " << movieId << "! " << *probeIt << endl; abort(); } } } Entry parseRatingLine(string line, int& movieId) { string::size_type firstComma = line.find(","); if (firstComma != string::npos) { string::size_type secondComma = line.find(",", firstComma + 1); if (secondComma != string::npos) { int userId = from_string(line.substr(0, firstComma)); char rating = from_string(line.substr(firstComma+1, secondComma)); return Entry(userId, rating, movieId); } else { cout << "parseRatingLine: Didn't find second comma in line: " << line << endl; return Entry(0,0,0); } } else { cout << "parseRatingLine: Didn't find first comma in line: " << line << endl; return Entry(0,0,0); } } void appendEntriesFromMovieFile(int movieId, std::vector& entries, bool isBinary) { string fileName = makeTrainingFileName(movieId); { ifstream file(fileName.c_str()); if (!file) { cout << "ERROR: appendEntriesFromMovieFile: couldn't open " << fileName.c_str() << endl; cout << "movieId was " << movieId << endl; exit(1); } if (isBinary) { union { char charRep[4]; int intRep; } charIntUnion; int userId; char rating; file.read(charIntUnion.charRep, 4); while (!file.eof()) { userId = charIntUnion.intRep; file.read(&rating, 1); entries.push_back(Entry(userId, rating, movieId)); file.read(charIntUnion.charRep, 4); } } else { string line; // Read the "header" line. getline(file, line); getline(file, line); while (!file.eof()) { entries.push_back(parseRatingLine(line, movieId)); getline(file, line); } } } } string makeTrainingFileName(int num) { return binaryEntriesDir + formatNumber(num/1000, 2) + "/" + trainingFilePrefix + formatNumber(num, 7) + trainingFileSuffix; } double calculateStdDev(const std::vector& entries, double average) { int numEntries = entries.size(); double sumSquareDist = 0.0; vector::const_iterator pos; for (pos = entries.begin(); pos != entries.end(); ++pos) { sumSquareDist += (pos->getRating() - average) * (pos->getRating() - average); } return sqrt(sumSquareDist/((double) numEntries)); }