google-research

Форк
0
116 строк · 3.8 Кб
1
#!/usr/bin/perl -w
# Splits a dataset into a training and test dataset by moving the first rating
# of the user from the training set into a test set.
# The input data must have one line per rating record. Each rating record is
# "<userid> <itemid> [...]" with whitespace-separated fields. It is assumed
# that the dataset is sorted by user and that user and item ids are numeric
# (item ids are expected to be non-negative integers, because negatives are
# sampled from the range [0, max_item_id]).
# For each user the first record will be placed into a test set.
# Three datasets are generated:
# - train:    The same as the input dataset but the first record of each user is
#             removed.
# - test:     Contains the first record of each user. Data format is the same as
#             train.
# - test_neg: 100 negative items for each user. Has as many lines as the test
#             set. Format "(<userid>, <itemid>)\t<negids>" where negids is a
#             tab-separated list of item ids. The negative items are sampled
#             from the set of all items. For a given user, the negative items
#             exclude all items from <train> and <test> for that user.
# The data generated by this script follows the same data format as in the
# paper: He et al. Neural Collaborative filtering, WWW17. This script is used
# to create the holdout split for hyperparameter tuning.
# Note that the dataset in this paper is sorted in reverse time order, so the
# first record in the file is the record that appeared the latest in time.
#
# Example:
# ./create_hold_out.pl --in Data/ml-1m.train.rating \
#                      --out_train Data/ml-1m.holdout.train.rating \
#                      --out_test Data/ml-1m.holdout.test.rating \
#                      --out_test_neg Data/ml-1m.holdout.test.negative
use strict;
use warnings;
use Getopt::Long qw(GetOptions);

# Number of negative items sampled for every test record.
my $NUM_NEGATIVES = 100;

# Returns the size of the item id space, i.e. (largest item id in the file) + 1.
# Arguments: path of the rating file. Dies if the file cannot be opened.
sub count_items {
  my ($path) = @_;

  my $max_item = -1;
  open my $in, "<", $path or die "Cannot open '$path' for reading: $!";
  # Read line by line; the rating file does not have to fit into memory.
  while (my $line = <$in>) {
    next if $line !~ /\S/;  # Ignore blank lines.
    my (undef, $item) = split ' ', $line;
    $item += 0;
    $max_item = $item if $item > $max_item;
  }
  close $in;
  return $max_item + 1;
}

# Samples $NUM_NEGATIVES distinct negative items for one user and writes the
# resulting test_neg line.
# Arguments:
#   $out         - file handle of the test_neg file.
#   $user        - id of the user.
#   $test_item   - the held-out (test) item of the user.
#   $train_items - hash ref whose keys are the user's training items.
#   $num_items   - size of the item id space; items are drawn from
#                  0 .. $num_items - 1.
# Dies if the user has fewer than $NUM_NEGATIVES eligible items, because the
# rejection sampling below could never terminate in that case.
sub write_negatives {
  my ($out, $user, $test_item, $train_items, $num_items) = @_;

  my %excluded = (%{$train_items}, $test_item => 1);
  # Only excluded ids that the sampler can actually propose reduce the number
  # of candidates.
  my $num_excluded = grep { /\A\d+\z/ && $_ < $num_items } keys %excluded;
  if ($num_items - $num_excluded < $NUM_NEGATIVES) {
    die "User $user has fewer than $NUM_NEGATIVES candidate negative items";
  }

  my %test_negs;
  while (scalar(keys(%test_negs)) < $NUM_NEGATIVES) {
    my $proposed_item = int(rand($num_items));
    next if exists($excluded{$proposed_item});
    $test_negs{$proposed_item} = 1;
  }
  print {$out} "($user, $test_item)\t" . join("\t", keys(%test_negs)) . "\n"
      or die "Cannot write negatives for user $user: $!";
  return;
}

my ($flag_in, $flag_out_train, $flag_out_test, $flag_out_test_neg);
GetOptions( 'in:s' => \$flag_in,
            'out_train:s' => \$flag_out_train,
            'out_test:s' => \$flag_out_test,
            'out_test_neg:s' => \$flag_out_test_neg,
      ) or die "Invalid command line options";

# All four paths are required; fail early instead of on a confusing open error.
for my $flag (['in', $flag_in],
              ['out_train', $flag_out_train],
              ['out_test', $flag_out_test],
              ['out_test_neg', $flag_out_test_neg]) {
  my ($name, $value) = @{$flag};
  die "Missing required flag --$name\n"
      unless defined($value) && length($value);
}

# Get the number of items.
my $num_items = count_items($flag_in);
print "Number of items: $num_items\n";

# Write the results
open my $IN, "<", $flag_in
    or die "Cannot open '$flag_in' for reading: $!";
open my $OUT_TRAIN, ">", $flag_out_train
    or die "Cannot open '$flag_out_train' for writing: $!";
open my $OUT_TEST, ">", $flag_out_test
    or die "Cannot open '$flag_out_test' for writing: $!";
open my $OUT_TEST_NEG, ">", $flag_out_test_neg
    or die "Cannot open '$flag_out_test_neg' for writing: $!";

# undef means "no user seen yet"; this keeps every numeric user id valid.
my ($prev_user, $prev_user_test_item);
my %train_items;
while (my $in_line = <$IN>) {
  chomp $in_line;
  next if $in_line !~ /\S/;  # Ignore blank lines.
  my ($user, $item) = split ' ', $in_line;
  $user += 0;
  $item += 0;
  if (!defined($prev_user) || $prev_user != $user) {
    # First record of a new user: it becomes the user's test record.
    print {$OUT_TEST} $in_line, "\n"
        or die "Cannot write to '$flag_out_test': $!";

    # Flush negs for previous user.
    if (defined($prev_user)) {
      write_negatives($OUT_TEST_NEG, $prev_user, $prev_user_test_item,
                      \%train_items, $num_items);
    }
    $prev_user = $user;
    $prev_user_test_item = $item;
    %train_items = ();
  } else {
    print {$OUT_TRAIN} $in_line, "\n"
        or die "Cannot write to '$flag_out_train': $!";
    $train_items{$item} = 1;
  }
}

# Flush negs for the last user.
if (defined($prev_user)) {
  write_negatives($OUT_TEST_NEG, $prev_user, $prev_user_test_item,
                  \%train_items, $num_items);
}

# Buffered write errors only surface at close, so check the output handles.
close $OUT_TEST_NEG or die "Cannot close '$flag_out_test_neg': $!";
close $OUT_TEST or die "Cannot close '$flag_out_test': $!";
close $OUT_TRAIN or die "Cannot close '$flag_out_train': $!";
close $IN;

__END__
117

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.