Vershun’s Brain Dump

Skeleton to load in netflixprize values

by Vershun on Oct.10, 2007, under Computers

It’s probably not the best code, but it doesn’t leave too large of a memory footprint and gives you nice access to the data.

This was compiled with Dev-C++ but should be find under Linux if you change the system(“pause”);. Oh and the filepath, of course. WordPress wreaked havoc on the code hopefully it works (I doubt anyone will use it anyway this is mainly for my own personal giggles).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//Constants
const int NUM_USERS = 480189;
const int NUM_MOVIES = 17770;
const int HASH_MOD = 500000;

/*** training grid
*  Matrix is sparsely populated and will get stack overflows if I try to fill
*  it with a static array.  Use a doubly linked list for rows and columns (movies
*  and users, respectively).
*/
typedef struct rating_node
{
   int movieid, userid;
   char rating;  //char to save space, static cast
   struct rating_node *next_movie;  //This traverses the row, goes to next user in rating matrix
   struct rating_node *next_user;  //This traverses the column, use this for going down movie list
} rating;

//for user hash table
typedef struct hash_node
{
   int userid;
   struct hash_node *next_uid;  //This traverses the hash nodes, use this for finding correct userid
   struct rating_node *user_col;  //This is a pointer to the head node for the user column
} hash;

//Prototypes

//Load Netflix data into arrays
void buildMatrixFromData(rating **movies, hash **users, char *trainingSetPath); //where movies and users are the head node arrays
//Training algorithm
static inline void train_set(int user, int movie, double rating);
//Add a node to the movie row
static inline void addToMovieRow(rating **head, rating *newnode);
//Add a new user on user column
void addToUserCol(hash ***head, rating *newnode);

//Debugging prototypes
void printMovieRow(rating *headnode);
void printUserColumn(hash **headnode, int userid);
void printNode(rating *node);

int main()
{
    //build the head node vectors
    rating *movies[NUM_MOVIES];  //static array of linked list elements
    hash *users[HASH_MOD];  //This is a hash table with linked heads that go to the linked list user columns.
    //Note:  I didn't check for relative primes (I'm pissed enough about recoding but performance was terrible).

    buildMatrixFromData(movies, users, "C:\netflix\training_set\training_set");

    system("pause");
}

void buildMatrixFromData(rating **movies, hash **users, char *trainingSetPath)
{
    int mid = 1;
    int uid, rat, idx;

    for(mid = 1; mid < NUM_MOVIES; mid++)
    {
        FILE *fp;

        char filename[strlen(trainingSetPath) + 15];
        char num[12];
        strcpy(filename, trainingSetPath);
        sprintf(num, "mv_%07d.txt", mid);
        strcat(filename, num);

        fp = fopen(filename, "r");

        //the first line is junk, skip it
        fscanf(fp, "%d:", &uid);

        while(fscanf(fp, "%d,%d,%*s", &uid, &rat) != EOF)
        {
          //Create new node for rating
          rating *newrat;
          if ((newrat = (rating*)malloc(sizeof(*newrat))) == NULL)
          {
             exit(EXIT_FAILURE);
          }
          newrat->movieid = mid;
          newrat->rating = rat;
          newrat->userid = uid;
          newrat->next_movie = newrat->next_user = NULL;

          addToMovieRow(&movies[mid-1], newrat);

          addToUserCol(&users, newrat);
        }
        fclose(fp);

        //For testing movie vector
        //printMovieRow(movies[mid-1]);
    }

    //For testing hash table
    //printUserColumn(users, 1806515);
}

static inline void addToMovieRow(rating **head, rating *newnode)
{
     //base case
     if(head == NULL)
     {
             *head = newnode;
     }
     else
     {
         newnode->next_user = *head;
         *head = newnode;
     }
}

void addToUserCol(hash ***head, rating *newnode)
{
     //use hash
     hash **hashhead = &((*head)[newnode->userid % HASH_MOD]);
     hash *tmp = (*head)[newnode->userid % HASH_MOD];
     //base case
     if(tmp == NULL)
     {
         //Create the new hashnode
         hash *newhash = (hash*)malloc(sizeof(*newhash));
         newhash->next_uid = NULL;
         newhash->user_col = newnode;
         newhash->userid = newnode->userid;
         *hashhead = newhash;
     }
     else
     {
         while(tmp != NULL)
         {
             if(tmp->userid == newnode->userid)
             {
                 newnode->next_movie = tmp->user_col;
                 tmp->user_col = newnode;
                 return;
             }
             tmp = tmp->next_uid;
         }
         //userid not in hash list
         hash *newhash = (hash*)malloc(sizeof(*newhash));
         newhash->next_uid = *hashhead;
         newhash->user_col = newnode;
         newhash->userid = newnode->userid;
         *hashhead = newhash;
     }
}

void printMovieRow(rating *headnode)
{
     if(headnode == NULL)
     {
             printf("Head node empty!n");
     }
     else
     {
         rating *tmp = headnode;
         while(tmp != NULL)
         {
             printNode(tmp);
             tmp = tmp->next_user;
         }
     }
}

void printUserColumn(hash **headnode, int userid)
{
     hash *head = headnode[userid % HASH_MOD];
     if(head == NULL)
     {
             printf("Head node empty!n");
     }
     else
     {
         hash *tmp = head;
         while(tmp != NULL)
         {
             if(tmp->userid == userid)
             {
                //Found our head, traverse down it printing nodes
                rating *rat = tmp->user_col;
                while(rat != NULL)
                {
                    printNode(rat);
                    rat = rat->next_movie;
                }
             }
             tmp = tmp->next_uid;
         }
     }
}

void printNode(rating *node)
{
     printf("mid: %d,  rating: %d,  uid: %dn", node->movieid, node->rating, node->userid);
}

Note on the data structures: You can basically think of this as a matrix. The rows are the movies and you can access the linked list of users through the static array “movies.” The columns require a bit of work. For speed purposes (and because they’re not sequential) they’re stored as a hash table. To get to a specific user, mod the user’s id by HASH_MOD (this value is fairly arbitrary, I just went for a high number so there should be minimal collisions). This will get you the head node of the hash linked list. Traverse through using next_uid checking the userid in the hash node until you find yours. The pointer “user_col” points to the head rating node for that user.

Just a skeleton, if anyone has any other ideas or suggestions be sure to hit me up, this was just a quick, whirlwind approach I had.

Share and Enjoy:
  • Digg
  • del.icio.us
  • StumbleUpon
No comments for this entry yet...

Leave a Reply

Looking for something?

Use the form below to search the site:

Still not finding what you're looking for? Drop a comment on a post or contact us so we can take care of it!

Visit our friends!

A few highly recommended friends...