#!/usr/bin/perl
#-------------------------
# Title:   find_duplicate_files.pl
# By:      john@stilen.com
# Date:    20080329
# Purpose: Find duplicate files
# How:     search a directory recursively for files
#          compare md5 of each file
#          output files with same md5 in organized fashion.
#
# Uses a hash of arrays
#
# Example of how data will be stored:
# %unique = (
#        9a4688f7fc2dceaa20d97fe14c606619 => [ "/dirA/file1", "/dirC/file2" ],
#        e768cef480108cae41e6b76ba2efa4a1 => [ "/dirB/file3", "/dirA/file4", "/dirB/file5" ],
#        8f6ca31f6295dd897c6df305f5df9203 => [ "/dirC/file6", "/dirB/file7", "/dirA/file8" ],
#      );
# Example output
# 0ee5731bcf50ea6f9d8dba1d025ea95a 2 times
#         /dirA/file1
#         /dirC/file2
# a1ef40defe3f8e28c9b194e624fc3f12 3 times
#         /dirB/file3
#         /dirA/file4
#	  /dirB/file5
# c79d3eb284049ce9e49a7e67e540624e 3 times
#         /dirC/file6
#	  /dirB/file7
#	  /dirA/file8
#-------------------------
#
# load the modules, Load The Modules, LOAD THE MODULES
#
use strict;      # used to keep me honest
use File::Find;  # used to traverse directory
use Digest::MD5; # used to find md5 of each file
#
# 1=print extra output, 0=minimum output
#
my $debug = 0;
#
# Digest table: each md5 hex string is a key, and its value is the
# array of file names whose content produced that digest.
#
my %unique;
#
# With no arguments on the command line, search the current directory.
#
push @ARGV, '.' if !@ARGV;
#
# Function calculates md5 on files
# and populates %unique
#
sub md5sum {
    #
    # File::Find callback: digest the current entry and record it in %unique.
    #
    # Reads:  $_                 - entry name as set by File::Find
    #         $File::Find::name  - path used in messages and stored in %unique
    #         $debug             - when 1, print "digest:path" for each file
    # Writes: %unique            - md5 hex digest => [ paths with that digest ]
    #
    # Entries that are not plain files are skipped silently.  Files that
    # cannot be opened, read or closed are skipped with a warning.
    #
    return unless -f $_;
    my $file = $File::Find::name;
    #
    # Three-argument open with a lexical handle: the name is taken literally,
    # so a file called ">x" or "cmd |" is read rather than truncated or run.
    #
    # warn and return are separate statements on purpose.  Written as
    # `warn "..." && return`, the && binds tighter than warn's argument
    # list, so return runs first and the warning is never printed.
    #
    my $fh;
    unless ( open( $fh, '<', $_ ) ) {
        warn "Can't open $file: $!";
        return;
    }
    binmode($fh);
    #
    # addfile croaks when reading fails; trap that so one unreadable file
    # does not abort the whole scan.
    #
    my $number = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    unless ( defined $number ) {
        warn "Can't read $file: $@";
        close($fh);
        return;
    }
    unless ( close($fh) ) {
        warn "Can't close $file: $!";
        return;
    }
    #
    # Print values if debug is on
    #
    print "$number:$file\n" if $debug == 1;
    #
    # push autovivifies the array the first time a digest is seen, so new
    # and already-known digests take the same path.
    #
    push @{ $unique{$number} }, $file;
    return;
}
#
# Finally call find
#
find( \&md5sum, @ARGV );
#
# Report every digest that was recorded for more than one file.
# Digests are visited in sorted order so that repeated runs over the
# same tree print their groups in the same order.
#
print "Size\tMd5 ID                            Ocurrances\n";
for my $digest ( sort keys %unique ) {
    my @paths = @{ $unique{$digest} };
    my $count = scalar @paths;
    #
    # Numeric comparison: $count is a number, so use > rather than the
    # string operator gt.
    #
    next unless $count > 1;
    #
    # The size shown is that of the first file recorded for this digest.
    #
    my $size = -s $paths[0];
    print "$size\t$digest  $count times\n";
    print "\t$_\n" for @paths;
}
