Skeleton to load in netflixprize values
by Vershun on Oct.10, 2007, under Computers
It’s probably not the best code, but it doesn’t leave too large of a memory footprint and gives you nice access to the data.
This was compiled with Dev-C++ but should be find under Linux if you change the system(“pause”);. Oh and the filepath, of course. WordPress wreaked havoc on the code hopefully it works (I doubt anyone will use it anyway this is mainly for my own personal giggles).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//Constants
const int NUM_USERS = 480189;
const int NUM_MOVIES = 17770;
const int HASH_MOD = 500000;
/*** training grid
* Matrix is sparsely populated and will get stack overflows if I try to fill
* it with a static array. Use a doubly linked list for rows and columns (movies
* and users, respectively).
*/
typedef struct rating_node
{
int movieid, userid;
char rating; //char to save space, static cast
struct rating_node *next_movie; //This traverses the row, goes to next user in rating matrix
struct rating_node *next_user; //This traverses the column, use this for going down movie list
} rating;
//for user hash table
typedef struct hash_node
{
int userid;
struct hash_node *next_uid; //This traverses the hash nodes, use this for finding correct userid
struct rating_node *user_col; //This is a pointer to the head node for the user column
} hash;
//Prototypes
//Load Netflix data into arrays
void buildMatrixFromData(rating **movies, hash **users, char *trainingSetPath); //where movies and users are the head node arrays
//Training algorithm
static inline void train_set(int user, int movie, double rating);
//Add a node to the movie row
static inline void addToMovieRow(rating **head, rating *newnode);
//Add a new user on user column
void addToUserCol(hash ***head, rating *newnode);
//Debugging prototypes
void printMovieRow(rating *headnode);
void printUserColumn(hash **headnode, int userid);
void printNode(rating *node);
int main()
{
//build the head node vectors
rating *movies[NUM_MOVIES]; //static array of linked list elements
hash *users[HASH_MOD]; //This is a hash table with linked heads that go to the linked list user columns.
//Note: I didn't check for relative primes (I'm pissed enough about recoding but performance was terrible).
buildMatrixFromData(movies, users, "C:\netflix\training_set\training_set");
system("pause");
}
void buildMatrixFromData(rating **movies, hash **users, char *trainingSetPath)
{
int mid = 1;
int uid, rat, idx;
for(mid = 1; mid < NUM_MOVIES; mid++)
{
FILE *fp;
char filename[strlen(trainingSetPath) + 15];
char num[12];
strcpy(filename, trainingSetPath);
sprintf(num, "mv_%07d.txt", mid);
strcat(filename, num);
fp = fopen(filename, "r");
//the first line is junk, skip it
fscanf(fp, "%d:", &uid);
while(fscanf(fp, "%d,%d,%*s", &uid, &rat) != EOF)
{
//Create new node for rating
rating *newrat;
if ((newrat = (rating*)malloc(sizeof(*newrat))) == NULL)
{
exit(EXIT_FAILURE);
}
newrat->movieid = mid;
newrat->rating = rat;
newrat->userid = uid;
newrat->next_movie = newrat->next_user = NULL;
addToMovieRow(&movies[mid-1], newrat);
addToUserCol(&users, newrat);
}
fclose(fp);
//For testing movie vector
//printMovieRow(movies[mid-1]);
}
//For testing hash table
//printUserColumn(users, 1806515);
}
static inline void addToMovieRow(rating **head, rating *newnode)
{
//base case
if(head == NULL)
{
*head = newnode;
}
else
{
newnode->next_user = *head;
*head = newnode;
}
}
void addToUserCol(hash ***head, rating *newnode)
{
//use hash
hash **hashhead = &((*head)[newnode->userid % HASH_MOD]);
hash *tmp = (*head)[newnode->userid % HASH_MOD];
//base case
if(tmp == NULL)
{
//Create the new hashnode
hash *newhash = (hash*)malloc(sizeof(*newhash));
newhash->next_uid = NULL;
newhash->user_col = newnode;
newhash->userid = newnode->userid;
*hashhead = newhash;
}
else
{
while(tmp != NULL)
{
if(tmp->userid == newnode->userid)
{
newnode->next_movie = tmp->user_col;
tmp->user_col = newnode;
return;
}
tmp = tmp->next_uid;
}
//userid not in hash list
hash *newhash = (hash*)malloc(sizeof(*newhash));
newhash->next_uid = *hashhead;
newhash->user_col = newnode;
newhash->userid = newnode->userid;
*hashhead = newhash;
}
}
void printMovieRow(rating *headnode)
{
if(headnode == NULL)
{
printf("Head node empty!n");
}
else
{
rating *tmp = headnode;
while(tmp != NULL)
{
printNode(tmp);
tmp = tmp->next_user;
}
}
}
void printUserColumn(hash **headnode, int userid)
{
hash *head = headnode[userid % HASH_MOD];
if(head == NULL)
{
printf("Head node empty!n");
}
else
{
hash *tmp = head;
while(tmp != NULL)
{
if(tmp->userid == userid)
{
//Found our head, traverse down it printing nodes
rating *rat = tmp->user_col;
while(rat != NULL)
{
printNode(rat);
rat = rat->next_movie;
}
}
tmp = tmp->next_uid;
}
}
}
void printNode(rating *node)
{
printf("mid: %d, rating: %d, uid: %dn", node->movieid, node->rating, node->userid);
}
Note on the data structures: You can basically think of this as a matrix. The rows are the movies and you can access the linked list of users through the static array “movies.” The columns require a bit of work. For speed purposes (and because they’re not sequential) they’re stored as a hash table. To get to a specific user, mod the user’s id by HASH_MOD (this value is fairly arbitrary, I just went for a high number so there should be minimal collisions). This will get you the head node of the hash linked list. Traverse through using next_uid checking the userid in the hash node until you find yours. The pointer “user_col” points to the head rating node for that user.
Just a skeleton, if anyone has any other ideas or suggestions be sure to hit me up, this was just a quick, whirlwind approach I had.