{
  "nbformat": 4,
  "cells": [
    {
      "execution_count": null,
      "source": [
        "%matplotlib inline"
      ],
      "outputs": [],
      "metadata": {
        "collapsed": false
      },
      "cell_type": "code"
    },
    {
      "source": [
        "\n# Preprocess MovieLens dataset\n\n\nIn this script, we pre-process the MovieLens 10M Dataset into the format required by contextual bandit\nalgorithms. This data set was released by GroupLens in 1/2009. Please first download the dataset from\nhttp://grouplens.org/datasets/movielens/, then unzip the file 'ml-1m.zip' into the examples folder.\n\n"
      ],
      "metadata": {},
      "cell_type": "markdown"
    },
    {
      "execution_count": null,
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import itertools\n",
        "\n",
        "\n",
        "def movie_preprocessing(movie):\n",
        "    \"\"\"One-hot encode the '|'-separated genres stored in the 'tag' column.\n",
        "\n",
        "    Returns `movie` without the 'tag' column, followed by one 0/1 column per\n",
        "    distinct genre named 'tag0', 'tag1', ... (genres taken in sorted order).\n",
        "    \"\"\"\n",
        "    movie_tag = [doc.split('|') for doc in movie['tag']]\n",
        "    # sort the genres so the tagN columns are identical from run to run\n",
        "    # (the iteration order of a set of strings is not stable between runs)\n",
        "    tags = sorted(set(itertools.chain.from_iterable(movie_tag)))\n",
        "    tag_index = {tag: idx for idx, tag in enumerate(tags)}\n",
        "\n",
        "    # use one-hot encoding for movie genres (here called tag)\n",
        "    tag_dummy = np.zeros([len(movie), len(tags)])\n",
        "    for row, row_tags in enumerate(movie_tag):\n",
        "        for tag in row_tags:\n",
        "            tag_dummy[row, tag_index[tag]] = 1\n",
        "\n",
        "    # combine the tag_dummy one-hot encoding table with the original movie table;\n",
        "    # reuse movie.index so the rows line up even without a default RangeIndex\n",
        "    tag_columns = ['tag' + str(i) for i in range(len(tags))]\n",
        "    tag_dummy = pd.DataFrame(tag_dummy, columns=tag_columns, index=movie.index)\n",
        "    return pd.concat([movie.drop('tag', axis=1), tag_dummy], axis=1)\n",
        "\n",
        "\n",
        "def feature_extraction(data):\n",
        "    \"\"\"Split the merged rating/movie table into the bandit inputs.\n",
        "\n",
        "    Returns (streaming_batch, user_feature, actions, reward_list):\n",
        "    streaming_batch is the user_id column of the top-50-movie ratings ordered\n",
        "    by timestamp, user_feature holds per-user normalized feature sums over the\n",
        "    remaining movies, actions is the list of the 50 most-rated movie ids and\n",
        "    reward_list keeps the (user_id, movie_id, reward) rows with rating >= 3.\n",
        "    \"\"\"\n",
        "    # actions: we use top 50 movies as our actions for recommendations\n",
        "    actions = data.groupby('movie_id').size().sort_values(ascending=False)[:50]\n",
        "    actions = list(actions.index)\n",
        "    is_action = data['movie_id'].isin(actions)\n",
        "\n",
        "    # user_feature: tags they've watched for non-top-50 movies normalized per user.\n",
        "    # Pick the feature columns before summing so that text columns such as\n",
        "    # movie_name are never handed to sum().\n",
        "    non_feature = ['user_id', 'movie_id', 'movie_name', 'rating', 'timestamp']\n",
        "    feature_columns = [col for col in data.columns if col not in non_feature]\n",
        "    user_feature = data.loc[~is_action, ['user_id'] + feature_columns]\n",
        "    user_feature = user_feature.groupby('user_id').sum()\n",
        "    user_feature = user_feature.div(user_feature.sum(axis=1), axis=0)\n",
        "\n",
        "    # streaming_batch: the result for testing bandit algorithms\n",
        "    # (sort_values replaces DataFrame.sort, which no longer exists in pandas)\n",
        "    top50_data = data[is_action].sort_values('timestamp', ascending=True)\n",
        "    streaming_batch = top50_data['user_id']\n",
        "\n",
        "    # reward_list: if rating >=3, the user will watch the movie\n",
        "    # assign() builds a new frame instead of writing into a slice of `data`\n",
        "    top50_data = top50_data.assign(reward=np.where(top50_data['rating'] >= 3, 1, 0))\n",
        "    reward_list = top50_data[['user_id', 'movie_id', 'reward']]\n",
        "    reward_list = reward_list[reward_list['reward'] == 1]\n",
        "    return streaming_batch, user_feature, actions, reward_list\n",
        "\n",
        "\n",
        "def main():\n",
        "    \"\"\"Build the bandit input files from the raw MovieLens files.\n",
        "\n",
        "    Reads movies.dat and ratings.dat from the working directory and writes\n",
        "    streaming_batch.csv, user_feature.csv, actions.csv, reward_list.csv and\n",
        "    action_context.csv (all tab-separated) to the working directory.\n",
        "    \"\"\"\n",
        "    # read and preprocess the movie data\n",
        "    movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')\n",
        "    movie = movie_preprocessing(movie)\n",
        "\n",
        "    # read the ratings data and merge it with movie data\n",
        "    rating = pd.read_table(\"ratings.dat\", sep=\"::\",\n",
        "                           names=[\"user_id\", \"movie_id\", \"rating\", \"timestamp\"], engine='python')\n",
        "    data = pd.merge(rating, movie, on=\"movie_id\")\n",
        "\n",
        "    # extract feature from our data set\n",
        "    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)\n",
        "    # streaming_batch is a Series: pin header=False so the file stays a bare list\n",
        "    # of user ids whatever the installed pandas uses as its default\n",
        "    streaming_batch.to_csv(\"streaming_batch.csv\", sep='\\t', index=False, header=False)\n",
        "    user_feature.to_csv(\"user_feature.csv\", sep='\\t')\n",
        "    pd.DataFrame(actions, columns=['movie_id']).to_csv(\"actions.csv\", sep='\\t', index=False)\n",
        "    reward_list.to_csv(\"reward_list.csv\", sep='\\t', index=False)\n",
        "\n",
        "    action_context = movie[movie['movie_id'].isin(actions)]\n",
        "    action_context.to_csv(\"action_context.csv\", sep='\\t', index=False)\n",
        "\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    main()"
      ],
      "outputs": [],
      "metadata": {
        "collapsed": false
      },
      "cell_type": "code"
    }
  ],
  "metadata": {
    "kernelspec": {
      "language": "python",
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      },
      "mimetype": "text/x-python",
      "version": "3.4.3",
      "pygments_lexer": "ipython3",
      "file_extension": ".py",
      "name": "python",
      "nbconvert_exporter": "python"
    }
  },
  "nbformat_minor": 0
}