summaryrefslogtreecommitdiffstats
path: root/trimtrees.pl
blob: 8bbc728b83229daa17a48fa4b90b0effd321bedb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#!/usr/bin/perl -w

use strict;
use Cwd;
use Digest::MD5 qw(md5);
use Dumpvalue;
use File::Compare qw(compare);
use File::Copy qw(cp);
use File::Find;
use File::Temp qw(tempfile);
use Getopt::Long;
our %Opt;
GetOptions(\%Opt, "maxlinks=i",) or die;

my @dirs = @ARGV or die "Usage: $0 OPTIONS directories";
our $DEBUG  = 0;
our $Signal = 0;
our %MD5;
our $Usedspc  = 0;
our $Savedspc = 0;

$SIG{INT} = sub {
    warn "Caught SIGINT; please stand by, I'm leaving as soon as possible...\n";
    $Signal++;
};
our $DV = Dumpvalue->new(
                         tick         => qq{"},
                         quoteHighBit => 1,
                         printUndef   => 1,
                        );
our $WD = Cwd::cwd;

sub fmt ($)
{
    local $_ = shift;
    s/(\d)(?=(\d{3})+$)/$1_/g;
    $_;
}

# $success = melt($some_file,$another)

# $some_file is always absolute, $another is always in the current
# directory

# melt dies on severe errors, returns 1 on success and 0 if it could
# not melt the files. It warns on suspect error conditions, but does
# not warn if they have no severe consequences.
sub melt ($$)
{
    my ($first, $basename) = @_;
    my ($fh, $tempfile) = tempfile("trimtrees1-XXXXXXXX", DIR => ".");
    unless ($tempfile)
    {
        die "Could not create a temporary file: $!";
    }
    unless (rename $basename, $tempfile)
    {
        warn sprintf("Cannot rename %s to %s (%s); Skipping.",
                     $DV->stringify($basename),
                     $DV->stringify($tempfile), $!,);
        unlink $tempfile or die "Could not unlink '$tempfile': $!";
        return 0;
    }
    unless (link $first, $basename)
    {
        my $link_err = $!;
        if (rename $tempfile, $basename)
        {
            # We could rename back and no harm should be done. Don't warn.
            return 0;
        }
        else
        {
            die sprintf(
                        "Could neither link %s to %s (%s) "
                          . "nor rename %s back to %s (%s).",
                        $DV->stringify($first),    $DV->stringify($basename),
                        $link_err,                 $DV->stringify($tempfile),
                        $DV->stringify($basename), $!
                       );
        }
    }
    close $fh or warn "Could not close the temporary filehandle: $!";
    unless (unlink $tempfile)
    {
        die sprintf("Could not unlink %s (was %s): %s",
                    $DV->stringify($tempfile),
                    $DV->stringify($basename), $!);
    }
    return 1;
}

{
    my %INODE;

    sub register ($$;$)
    {
        my ($md5, $cand, $candstat) = @_;
        $MD5{$md5} = $cand;
        my @stat = $candstat ? @$candstat : stat $cand;
        my $size = $stat[7];
        warn "size undefined"    unless defined $size;
        warn "usedspc undefined" unless defined $Usedspc;
        if ($DEBUG)
        {
            warn(sprintf "\nDEBUG: cand[%s]size[%s]stat[%s]",
                 $cand, $size, join(":", @stat));
        }
        return if $INODE{$stat[1]}++;    # don't count twice
        $Usedspc += $size;
    }
}

undef $/;
my %reported;
my $files         = 0;
my $dirs          = @dirs;
my $tl_dirs_todo  = 0;
my $tl_dirs_doing = 0;

sub xreport ()
{
    my $uniq_files = keys %MD5;
    printf(
           "\rtlds[%s]cur[%s]uniq[%s]fils[%s]spcused[%s]saved[%s]",
           map { fmt($_) } (
                            $tl_dirs_todo, $tl_dirs_doing, $uniq_files,
                            $files,        $Usedspc,       $Savedspc
                           )
          );
}

$| = 1;
for my $diri (0 .. $#dirs)
{
    my $root = $dirs[$diri];
    find(
        {
         wanted => sub {
             if ($Signal)
             {
                 $File::Find::prune = 1;
                 return;
             }
             if ($File::Find::name eq $root)
             {
                 my $td = $_;
                 opendir my ($dh), $td;
                 my (@tl) = grep { !/^\./ && -d "$td/$_" } readdir $dh;
                 $tl_dirs_todo = @tl;
             }
             elsif (-d)
             {
                 my $slashes = $File::Find::name =~ tr|/||;
                 if ($slashes == 1)
                 {
                     $tl_dirs_doing++;
                 }
             }
             return
               if
               -l; # relative links would need special treatment that does not pay off
             return unless -f _;
             return unless -s _;    # empty files more risk that files with
                                    # content and no gain
             $files++;
             my $basename = $_;
             my $fh;

             unless (open $fh, "<", $basename)
             {
                 warn sprintf("Cannot read %s (%s); Skipping.",
                              $DV->stringify($File::Find::name), $!,);
                 $Usedspc += -s $basename;
                 return;
             }
             my $data = <$fh>;
             close $fh;
             my $md5  = md5 $data;
             my $cand = $File::Find::name;
             if ($Opt{maxlinks})
             {
                 my (@maxlstat) = stat($cand);
                 if ($maxlstat[3] > $Opt{maxlinks})
                 {
                     # the case that we have to make a new file from a link
                     my ($fh, $tempfile) =
                       tempfile("trimtrees2-XXXXXXXX", DIR => ".");
                     unless ($tempfile)
                     {
                         die "Could not create a temporary file: $!";
                     }
                     unless (rename $cand, $tempfile)
                     {
                         die sprintf("Could not rename %s to %s: %s",
                                     $DV->stringify($cand),
                                     $DV->stringify($tempfile), $!);
                     }
                     cp $tempfile,
                       $cand
                       or die sprintf("Could not cp %s to %s: %s",
                                      $DV->stringify($tempfile),
                                      $DV->stringify($cand), $!,);
                     unlink $tempfile;
                     $Savedspc -= $maxlstat[7];
                 }
             }
             if (my $first = $MD5{$md5})
             {
                 unless (File::Spec->file_name_is_absolute($first))
                 {
                     $first = File::Spec->catfile($WD, $first);
                 }
                 my (@firststat) = stat($first);
                 die sprintf("illegal firststat[%s]first[%s]",
                             join(":", @firststat), $DV->stringify($first),)
                   unless $firststat[1];
                 my $different = compare $first, $basename;
                 if ($different != 0 && $File::Compare::VERSION < 1.1005)
                 {
                     # workaround bug # 37716 in File::Compare
                     $different = compare "$first\0", "$basename\0";
                 }
                 if ($different < 0)
                 {
                     warn sprintf("Cannot compare %s and %s (%s); Skipping.",
                                  $DV->stringify($first),
                                  $DV->stringify($cand), $!,);
                     goto XREPORT;    # some error occurred
                 }
                 die sprintf("Sensation, %s and %s are not equal with same MD5",
                             $DV->stringify($first), $DV->stringify($cand),)
                   if $different;
                 my (@candstat) = stat($basename);
                 goto XREPORT
                   unless $candstat[0] == $firststat[0]; # different file system
                 if ($candstat[1] == $firststat[1])
                 {                                       # already same inode
                     if (0 && $Opt{maxlinks} && $firststat[3] > $Opt{maxlinks})
                     {
                     }
                     else
                     {
                         goto XREPORT;
                     }
                 }
                 if ($Opt{maxlinks} && $firststat[3] >= $Opt{maxlinks})
                 {
                     register($md5, $cand, \@candstat)
                       if $candstat[3] < $Opt{maxlinks};
                 }
                 elsif (melt($first, $basename))
                 {
                     if ($candstat[3] == 1)
                     {    # we don't save space otherwise
                         $Savedspc += $firststat[7];
                     }
                 }
                 else
                 {
                     register($md5, $cand, \@candstat);
                 }
             }
             else
             {
                 register($md5, $cand);
             }
           XREPORT:
             return if $files % 100;
             xreport;
         },
         no_chdir => 1,
        },
        $root
        );
    last if $Signal;
}
xreport;
print "\nDONE\n";

__END__

=head1 NAME

trimtrees - traverse directories, find identical files, replace with hard links

=head1 SYNOPSIS

 trimtrees.pl OPTIONS directory...

 OPTIONS:

  --maxlinks N            limit the amount of links per file

=head1 DESCRIPTION

Traverse all directories named on the command line, compute MD5
checksums and find files with identical MD5. IF they are equal, do a
real comparison if they are really equal, replace the second of two
files with a hard link to the first one.

Special care is taken to cope with C<Too many links> error conditions.
The inode that is overbooked in such a way, is taken out of the pool
and replaced with the another one such that the minimum of files
needed is kept on disk.

The C<--maxlinks> option can be used to reduce the linkcount on all
files within a tree, thus preparing the tree for a subsequent call to
C<cp -al>. This operation can be thought of the reverse of the normal
trimtrees operation (--maxlinks=1 produces a tree without hard links).

=head1 SIGNALS

SIGINT is caught and the script stops as soon as the current file
is finished.

=head2 RISKS

The whole idea of replacing identical files with hard links has
inherent dangers. Once two files have turned into one inode other
processes may accidentally change both although they intend to alter
only one. Please consider if this can happen in your environment.

=cut

	Local Variables:
	mode: cperl
	cperl-indent-level: 2
	End: