# Source code for bigdl.dataset.movielens

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import zipfile
import numpy as np

from bigdl.dataset import base

# Base URL of the MovieLens downloads; the archive file name is appended to it.
SOURCE_URL = 'http://files.grouplens.org/datasets/movielens/'
def read_data_sets(data_dir):
    """
    Download (if needed), extract and parse the movielens 1m ratings.

    The archive ``ml-1m.zip`` is obtained through ``base.maybe_download`` and
    is unpacked into ``data_dir`` only when ``<data_dir>/ml-1m`` does not
    exist yet.

    :param data_dir: The directory storing the movielens data
    :return: a 2D numpy integer array with one row per non-blank line of
        ``ratings.dat``; each row holds every ``::``-separated field of that
        line (presumably user id, item id, rating, timestamp -- confirm
        against the dataset README)
    """
    WHOLE_DATA = 'ml-1m.zip'
    local_file = base.maybe_download(WHOLE_DATA, data_dir, SOURCE_URL + WHOLE_DATA)
    extracted_to = os.path.join(data_dir, "ml-1m")
    if not os.path.exists(extracted_to):
        print("Extracting %s to %s" % (local_file, data_dir))
        # Open the archive only when it is actually needed, and let the
        # context manager close it even if extraction fails.
        with zipfile.ZipFile(local_file, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
    rating_files = os.path.join(extracted_to, "ratings.dat")

    with open(rating_files, "r") as ratings:
        # Skip blank lines (e.g. a trailing newline) so every row has the
        # same number of fields and the integer conversion cannot fail on ''.
        rating_list = [line.strip().split("::") for line in ratings if line.strip()]
    movielens_data = np.array(rating_list).astype(int)
    return movielens_data

def get_id_pairs(data_dir):
    """
    Load the movielens ratings and keep only the first two columns.

    :param data_dir: The directory storing the movielens data
    :return: a 2D numpy array holding columns 0 and 1 of the array returned
        by ``read_data_sets`` (presumably user id and item id -- confirm
        against the dataset README)
    """
    movielens_data = read_data_sets(data_dir)
    return movielens_data[:, 0:2]

def get_id_ratings(data_dir):
    """
    Load the movielens ratings and keep only the first three columns.

    :param data_dir: The directory storing the movielens data
    :return: a 2D numpy array holding columns 0 to 2 of the array returned
        by ``read_data_sets`` (presumably user id, item id and rating --
        confirm against the dataset README)
    """
    movielens_data = read_data_sets(data_dir)
    return movielens_data[:, 0:3]


# When run as a script, fetch and parse the dataset into /tmp/movielens/.
if __name__ == "__main__":
    movielens_data = read_data_sets("/tmp/movielens/")