google-research
116 строк · 3.8 Кб
1#!/usr/bin/perl -w
# Splits a dataset into a training and a test dataset by moving the first
# rating of each user from the training set into the test set.
4# The input data must have one line per rating record. Each rating record is
5# "<userid> <itemid> [...]". It is assumed that the dataset is sorted by user.
6# For each user the first record will be placed into a test set.
7# Three datasets are generated:
8# - train: The same as the input dataset but the first record of each user is
9# removed.
10# - test: Contains the first record of each user. Data format is the same as
11# train.
#   - test_neg: 100 negative items for each user. Has as many lines as the test
#               set. Format "(<userid>, <itemid>)<TAB><negids>" where negids
#               is a tab-separated list of item ids. The negative items are
#               sampled uniformly from the set of all items. For a given user,
#               the negative items exclude all items from <train> and <test>
#               for that user.
17# The data generated by this script follows the same data format as in the
18# paper: He et al. Neural Collaborative filtering, WWW17. This script is used
19# to create the holdout split for hyperparameter tuning.
20# Note that the dataset in this paper is sorted in reverse time order, so the
21# first record in the file is the record that appeared the latest in time.
22#
23# Example:
24# ./create_hold_out.pl --in Data/ml-1m.train.rating \
25# --out_train Data/ml-1m.holdout.train.rating \
26# --out_test Data/ml-1m.holdout.test.rating \
27# --out_test_neg Data/ml-1m.holdout.test.negative
28use strict;
29use warnings;
30use Getopt::Long qw(GetOptions);
31
# Command-line flags. All four are required:
#   --in            input rating file, one "<userid> <itemid> [...]" per line
#   --out_train     output file for the training records
#   --out_test      output file for the held-out (first) record of each user
#   --out_test_neg  output file for the sampled negative items
my ($flag_in, $flag_out_train, $flag_out_test, $flag_out_test_neg);
my $usage = "Usage: $0 --in <file> --out_train <file> " .
            "--out_test <file> --out_test_neg <file>\n";
GetOptions(
  'in:s'           => \$flag_in,
  'out_train:s'    => \$flag_out_train,
  'out_test:s'     => \$flag_out_test,
  'out_test_neg:s' => \$flag_out_test_neg,
) or die $usage;

# Fail early, naming the offending flag, instead of dying later with no
# explanation. The ":s" spec makes the value optional, so a flag given without
# a value arrives as an empty string and is rejected here as well.
for my $required (
  ['in',           $flag_in],
  ['out_train',    $flag_out_train],
  ['out_test',     $flag_out_test],
  ['out_test_neg', $flag_out_test_neg],
) {
  my ($name, $value) = @{$required};
  die "Missing required flag --$name\n$usage"
    unless defined($value) && length($value);
}
38
# Get the number of items.
# The count is derived as (largest item id in the input) + 1, i.e. item ids
# are treated as non-negative integers starting at 0.
my $num_items = -1;
# Use the low-precedence "or" here: with "||" the die would be attached to the
# filename expression and a failed open() would go unnoticed.
open my $IN, "<", $flag_in or die "Cannot open '$flag_in' for reading: $!";
# Read line by line instead of slurping the whole file into a list.
while (my $line = <$IN>) {
  chomp $line;
  # Only the second column (the item id) matters for this pass.
  my (undef, $item) = split /\s/, $line;
  $item += 0;  # Force a numeric value; a missing column counts as item 0.
  if ($item > $num_items) {
    $num_items = $item;
  }
}
close $IN;
$num_items += 1;
print "Number of items: $num_items\n";
53
# Write the results

# Number of negative items sampled for each user.
my $NUM_TEST_NEGS = 100;

# Samples distinct negative items for one user and writes them as a single
# line "(<user>, <test_item>)\t<neg>\t<neg>..." to $out_fh.
#
# Arguments:
#   $out_fh      - output file handle for the negatives file.
#   $user        - id of the user the line is written for.
#   $test_item   - the user's held-out item; never sampled as a negative.
#   $train_items - hash ref whose keys are the user's training items; never
#                  sampled as negatives.
#   $num_items   - negatives are drawn uniformly from 0 .. $num_items - 1.
#   $num_negs    - number of negatives requested.
#
# If fewer than $num_negs candidate items exist for the user, a warning is
# printed and only the available candidates are written. Without this cap the
# rejection-sampling loop below would never terminate.
sub write_test_negatives {
  my ($out_fh, $user, $test_item, $train_items, $num_items, $num_negs) = @_;

  # Count the ids inside the sampling range that must be rejected, so that we
  # know how many distinct negatives can be produced at all.
  my %excluded = (%{$train_items}, $test_item => 1);
  my $num_excluded = grep {
    $_ == int($_) && $_ >= 0 && $_ < $num_items
  } keys %excluded;
  my $num_available = $num_items - $num_excluded;

  my $num_wanted = $num_negs;
  if ($num_available < $num_wanted) {
    warn "User $user has only $num_available candidate negative items; " .
         "writing $num_available instead of $num_negs.\n";
    $num_wanted = $num_available;
  }

  # Rejection sampling: draw uniformly and keep a draw only if it is neither
  # the test item nor a training item. Using a hash makes repeated draws of
  # the same item collapse into one entry.
  my %test_negs;
  while (scalar(keys(%test_negs)) < $num_wanted) {
    my $proposed_item = int(rand($num_items));
    next if $proposed_item == $test_item;
    next if exists $train_items->{$proposed_item};
    $test_negs{$proposed_item} = 1;
  }

  print {$out_fh}
      "($user, $test_item)\t" .
      join("\t", keys(%test_negs)) . "\n";
  return;
}

# Note the low-precedence "or": with "||" the die would bind to the filename
# and a failed open() would go unnoticed.
open $IN, "<", $flag_in
    or die "Cannot open '$flag_in' for reading: $!";
open my $OUT_TRAIN, ">", $flag_out_train
    or die "Cannot open '$flag_out_train' for writing: $!";
open my $OUT_TEST, ">", $flag_out_test
    or die "Cannot open '$flag_out_test' for writing: $!";
open my $OUT_TEST_NEG, ">", $flag_out_test_neg
    or die "Cannot open '$flag_out_test_neg' for writing: $!";

# State of the user whose records are currently being read. $prev_user stays
# undefined until the first record has been seen, so no particular user id has
# to be reserved as a sentinel.
my $prev_user;
my $prev_user_test_item;
my %train_items;

# The input is assumed to be sorted by user: a change of the user id marks the
# first record of the next user.
while (my $in_line = <$IN>) {
  chomp $in_line;
  my ($user, $item) = split /\s/, $in_line;
  $user += 0;
  $item += 0;
  if (!defined($prev_user) || $prev_user != $user) {
    # The first record of a user is held out as the test record.
    print {$OUT_TEST} $in_line, "\n";

    # Flush negs for previous user; all of that user's training items are
    # known only now.
    if (defined $prev_user) {
      write_test_negatives($OUT_TEST_NEG, $prev_user, $prev_user_test_item,
                           \%train_items, $num_items, $NUM_TEST_NEGS);
    }
    $prev_user = $user;
    $prev_user_test_item = $item;
    %train_items = ();
  } else {
    # Every further record of the same user stays in the training set.
    print {$OUT_TRAIN} $in_line, "\n";
    $train_items{$item} = 1;
  }
}

# Flush negs for the last user, who is not followed by a change of user id.
if (defined $prev_user) {
  write_test_negatives($OUT_TEST_NEG, $prev_user, $prev_user_test_item,
                       \%train_items, $num_items, $NUM_TEST_NEGS);
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close $OUT_TEST_NEG or die "Cannot close '$flag_out_test_neg': $!";
close $OUT_TEST or die "Cannot close '$flag_out_test': $!";
close $OUT_TRAIN or die "Cannot close '$flag_out_train': $!";
close $IN;
115
116__END__
117