#!/usr/bin/perl use 5.12.3; use strict; use warnings; use Cwd qw(abs_path cwd); use Digest::MD5 qw(md5_hex); use IO::Handle qw(autoflush); use File::Basename qw(basename dirname); #use File::Slurp qw(read_file); use diagnostics; use threads qw(yield); use threads::shared; use Thread::Queue; use Thread::Semaphore; #use Fcntl qw(:flock); use POSIX qw(SIGINT); use POSIX qw(ceil); chomp(my $cores = `grep -c ^processor /proc/cpuinfo`); my (@lib, $mode); # Path to and name of log file to be used for logging. my $logf = "$ENV{HOME}/md5db.log"; # Delimiter used for database my $delim = "\t\*\t"; # Array for storing the actual arguments used by the script internally. # Might be useful for debugging. my @cmd = (basename($0)); # Name of database file. my $db = 'md5.db'; # Clear screen command. my $clear = `clear && echo`; # Creating a hash that will store the names of files that are # too big to fit into RAM. We'll process them last. my %large; # Creating a few shared variables. # %err will be used for errors # $n will be used to count the number of files processed # %md5h is the database hash my %err :shared; my $n :shared = 0; my %md5h :shared; my %file_contents :shared; my $stopping :shared = 0; my $file_stack :shared = 0; my $busy :shared = 0; my @parts :shared = (0) x 2; my $disk_size = 500000000; # This will be used to control access to the logger subroutine. my $semaphore = Thread::Semaphore->new(); POSIX::sigaction(SIGINT, POSIX::SigAction->new(\&handler)) || die "Error setting SIGINT handler: $!\n"; # Creating a custom POSIX signal handler. # First we create a shared variable that will work as a SIGINT switch. # Then we define the handler subroutine. # If ^C is pressed for the first time, # trip the switch and let the current threads finish. # If ^C is pressed a second time, quit directly. # Each subroutine to be used for starting threads will have to # take notice of the state of the $saw_sigint variable. my $saw_sigint :shared = 0; sub handler { if ($saw_sigint == 1) { logger('int', $n); hash2file(); exit; } else { $saw_sigint = 1; } } # Open file handle for the log file open(my $LOG, '>>', $logf) or die "Can't open '$logf': $!"; # Make the $LOG file handle unbuffered for instant logging. $LOG->autoflush(1); # Duplicate STDERR as a regular file handle open(my $SE, ">&STDERR") or die "Can't duplicate STDERR: $!"; ### Subroutine for printing usage instructions sub usage { my $s = basename($0); say <<"HELP" Usage: $s [options] [directory 1] .. [directory N] -help Print this help message. -import Import MD5 sums to the database from already existing \*.MD5 files in each directory. -index Index new files in each directory. -test Test the MD5 sums of the files in the database to see if they've changed. HELP } # This loop goes through the argument list as passed to the script # by the user when ran. foreach my $arg (@ARGV) { # If argument starts with a dash '-', interprete it as an option if ($arg =~ /^-/) { given ($arg) { # When '-import', set script mode to 'import', and call # the md5import subroutine later. when (/^-import$/) { if (!$mode) { push(@cmd, $arg); $mode = 'import'; } } # When '-help', set script mode to 'help', and print # usage instructions later. when (/^-help$/) { if (!$mode) { push(@cmd, $arg); $mode = 'help'; } } # When '-index', set script mode to 'index', and call # the md5index subroutine later. when (/^-index$/) { if (!$mode) { push(@cmd, $arg); $mode = 'index'; } } # When '-test', set the script mode to 'test', and call # the md5test subroutine later. when (/^-test$/) { if (!$mode) { push(@cmd, $arg); $mode = 'test'; } } } # If argument is a directory, include it in the @lib array } elsif (-d $arg) { push(@lib, $arg); push(@cmd, $arg); } } # If no switches were used, print usage instructions if (!@lib || !$mode || $mode eq 'help') { usage; exit; } #say "@cmd\n"; # Subroutine for controlling the log file # Applying a semaphore so multiple threads won't try to # access it at once, just in case ;-) sub logger { $semaphore->down(); my($arg, $sw, @fn, $n); # Creating a variable to hold the current time. my $now = localtime(time); # Array of accepted switches to this subroutine my @larg = qw{start int gone corr diff end}; # Loop through all the arguments passed to this subroutine # Perform checks that decide which variable the arguments are to # be assigned to. CHECK: while (@_) { $arg = shift(@_); # If $arg is a switch, set the $sw variable and start # the next iteration of the CHECK loop. foreach (@larg) { if ($_ eq $arg) { $sw = $arg; next CHECK; } } # If $arg is a number assign it to $n, if it's a file # add it to @fn. if ($arg =~ /^[0-9]*$/) { $n = $arg; } else { push(@fn, $arg); } } given ($sw) { # Starts writing the log. when ('start') { say $LOG "\n**** Logging started on $now ****\n"; say $LOG "Running script in '$mode' mode.\n"; } # When the script is interrupted by user pressing ^C, # say so in STDOUT, close the log. when ('int') { say "\nInterrupted by user!\n"; say $LOG $n . " file(s) were tested."; say $LOG "\n**** Logging ended on $now ****\n"; close $LOG or die "Can't close '$LOG': $!"; } # Called when file has been deleted or moved. when ('gone') { say $LOG $fn[0] . "\n\t" . "has been (re)moved.\n"; $err{$fn[0]} = "has been (re)moved.\n"; } # Called when file has been corrupted. when ('corr') { say $LOG $fn[0] . "\n\t" . "has been corrupted.\n"; $err{$fn[0]} = "has been corrupted.\n"; } when ('diff') { say $LOG $fn[0] . "\n\t" . "doesn't match the hash in database.\n"; $err{$fn[0]} = "doesn't match the hash in database.\n"; } # Called when done, and to close the log. # If no errors occurred write "Everything is OK!" to the log. # If errors occurred print the %err hash. # Either way, print number of files processed. when ('end') { if (!%err) { say $LOG "\nEverything is OK!\n"; } else { say "\n**** Errors Occurred ****\n"; foreach my $fn (sort keys %err) { say $SE $fn . "\n\t" . $err{$fn}; } } say $LOG $n . " file(s) were tested.\n" if ($n); say $LOG "\n**** Logging ended on $now ****\n"; close $LOG or die "Can't close '$LOG': $!"; } } $semaphore->up(); } # Subroutine for reading a database file into the database hash. # This is the first subroutine that will be executed and all others # depend upon it, cause without it we don't have a # database hash to work with. sub file2hash { # The format string which is used for parsing the database file my $format = qr/^.*\t\*\t[[:alnum:]]{32}$/; my (@dbfile, @gone); # Open the database file and read it into the @dbfile variable open(MD5DB, '<', $db) or die "Can't open '$db': $!"; chomp (@dbfile = ()); close(MD5DB) or die "Can't close '$db': $!"; # Loop through all the lines in the database file and split # them before storing in the database hash. # Also, print each line to STDOUT for debug purposes foreach my $line (@dbfile) { # If current line matches the proper database file format, # continue. if ($line =~ /$format/) { my ($fn, $hash) = (split(/\Q$delim/, $line)); # If $fn is a real file and not already in the hash, # continue. if (-f $fn && ! $md5h{$fn}) { $md5h{$fn} = $hash; say "$fn". "$delim" . "$hash"; # Saves the names of deleted or moved files in '@gone' # for printing at the end of this subroutine. } else { push(@gone, $fn); } } } # Clears the screen, thereby scrolling past the database file print print $clear; # Loops through the @gone array and logs every file name # that's been deleted or moved. foreach my $fn (@gone) { logger('gone', $fn); } } # Subroutine for printing the database hash to the database file sub hash2file { open(MD5DB, '>', "$db") or die "Can't open '$db': $!"; # Loops through all the keys in the database hash and prints # the entries (divided by the $delim variable) to the database file. foreach my $k (sort keys %md5h) { say MD5DB $k . $delim . $md5h{$k}; } close(MD5DB) or die "Can't close '$db': $!"; } # Subroutine for finding files # Finds all the files inside the directory name passed to it, # and sorts the output before storing it in the @files array. sub getfiles { my $dn = shift; my @files; open(FIND, '-|', qq(find -L "$dn" -type f -name "*")) or die "Can\'t run 'find': $!"; while (my $fn = ()) { chomp($fn); $fn =~ s($dn/)(); push(@files, $fn) if (-f $fn && $fn ne basename($db)); } close(FIND) or die "Can't close 'find': $!"; return @files; } # Subroutine for finding and parsing *.MD5 files, adding the hashes # to the database hash and thereby also to the file. # It takes 1 argument: # (1) the thread queue sub md5import { my $q = shift; my ($fn, $hash, @fields, @lines); # The format string which is used for parsing the *.MD5 files. my $format = '^[[:alnum:]]{32}\s\*.*'; # Loop through the @files array. while (my $md5fn = $q->dequeue_nb()) { # If the file extension is *.MD5 in either upper- or # lowercase, continue. if ($md5fn =~ /.md5$/i) { # Open the *.MD5 file and read its contents to the # @lines array. open(MD5, '<', $md5fn) or die "Can't open '$md5fn': $!"; chomp(@lines = ()); close(MD5) or die "Can't close '$md5fn': $!"; # Loop to check that the format of the *.MD5 file really # is correct before proceeding. foreach my $line (@lines) { # If format string matches the line(s) in the *.MD5 # file, continue. if ($line =~ /$format/) { # Split the line so that the hash and file name go # into @fields array. # After that strip the path (if any) of the file # name, and prepend the path of the *.MD5 file to # it instead. # Store hash and file name in the $hash and $fn # variables for readability. @fields = split(/\s\Q*/, $line, 2); my $path = dirname($md5fn); $hash = $fields[0]; if ($path eq '.') { $fn = basename($fields[1]); } else { $fn = dirname($md5fn) . '/' . basename($fields[1]); } # Convert CR+LF newlines to proper LF to avoid # identical file names from being interpreted as # different. $fn =~ s/\r//; # Unless file name already is in the database hash, # print a message, add it to the hash. if (! $md5h{$fn} && -f $fn) { say "$fn" . "\n\t" . "Imported MD5 sum from '" . basename($md5fn) . "'.\n"; $md5h{$fn} = $hash; # If file name is not a real file, write to # the log. # If file name is in database hash but the # MD5 sum from the MD5 file doesn't match, # print to the log. } elsif (! -f $fn) { logger('gone', $fn); } elsif ($md5h{$fn} ne $hash) { logger('diff', $md5fn); } } } } } } sub md5sum { my $fn = shift; my $last = $parts[1]; # If $last is set it means we're dealing with a split file, so # set $busy to 1 for the other threads to fall asleep. We need to # stay in this thread when we're splitting files. So we create an # 'until' loop which will process all the parts until the last one. if ($last) { $busy = 1; my $md5 = Digest::MD5->new; my $done = 0; until ($done == $last) { if ($file_contents{$fn}) { $md5->add($file_contents{$fn}); lock($file_stack); $file_stack -= length($file_contents{$fn}); lock(%file_contents); delete($file_contents{$fn}); $done++; } yield(); } my $hash = $md5->hexdigest; $busy = 0; return $hash; } else { my $hash = md5_hex($file_contents{$fn}); lock($file_stack); $file_stack -= length($file_contents{$fn}); lock(%file_contents); delete($file_contents{$fn}); return $hash; } } # Subroutine to index the files # i.e calculate and store the MD5 sums in the database hash/file. # It takes 1 argument: # (1) the thread queue sub md5index { my $q = shift; my $tid = threads->tid(); # Loop through the thread que. LOOP2: while ((my $fn = $q->dequeue_nb()) || !$stopping) { if ($fn) { $md5h{$fn} = md5sum($fn); lock($n); $n++; } else { yield(); } # If the $saw_sigint variable has been tripped. # Quit this 'while' loop, thereby closing the thread. if ($saw_sigint == 1) { say "Closing thread: " . $tid; last; } } while (!$q->pending() && !$stopping) { yield(); goto(LOOP2) if (%large && !$busy); } } # Subroutine for testing to see if the MD5 sums # in the database file are correct (i.e. have changed or not). # It takes 1 argument: # (1) the thread queue sub md5test { my $q = shift; my $tid = threads->tid(); my ($oldmd5, $newmd5); # Loop through the thread queue. LOOP: while ((my $fn = $q->dequeue_nb()) || !$stopping) { if ($fn) { $newmd5 = md5sum($fn); $oldmd5 = $md5h{$fn}; # If the new MD5 sum doesn't match the one in the hash, # and file doesn't already exist in the %err hash, # log it and replace the old MD5 sum in the hash with # the new one. if ($newmd5 ne $oldmd5 && ! $err{$fn}) { logger('diff', $fn); $md5h{$fn} = $newmd5; } lock($n); $n++; } else { yield(); } # If the $saw_sigint variable has been tripped. # Quit this 'while' loop, thereby closing the thread. if ($saw_sigint == 1) { say "Closing thread: " . $tid; last; } } while (!$q->pending() && !$stopping) { yield(); goto(LOOP) if (%large && !$busy); } } sub md5flac { my $fn = shift; my (@req, $hash); if ($fn =~ /.flac$/i) { if (! @req) { chomp(@req = ( `which flac metaflac 2>&-` )); if (! $req[0] || ! $req[1]) { say "You need both 'flac' and 'metaflac' to test FLAC files!\n" . "Using normal test method...\n"; @req = '0'; return; } } unless ($req[0] = '0') { chomp($hash = `metaflac --show-md5sum "$fn" 2>&-`); if ($? != 0 && $? != 2) { logger('corr', $fn); return; } system('flac', '--totally-silent', '--test', "$fn"); if ($? != 0 && $? != 2) { logger('corr', $fn); return; } return $hash; } } } # Create the thread queue. my $q = Thread::Queue->new(); # Depending on which script mode is active, # set the @run array to the correct arguments. # This will be used to start the threads later. my @run; given ($mode) { when ('import') { @run = (\&md5import, $q); } when ('index') { @run = (\&md5index, $q); } when ('test') { @run = (\&md5test, $q); } } my @threads; foreach (1 .. $cores) { push(@threads, threads->create(@run)); } # This loop is where the actual action takes place # (i.e. where all the subroutines get called from) foreach my $dn (@lib) { if (-d $dn) { # Changing $dn to the absolute path. $dn = abs_path($dn); # Adding the current PATH to the $db variable. $db = "$dn/$db"; # Change directory to $dn. chdir($dn) or die "Can't change directory to '$dn': $!"; # Start logging. logger('start'); # If the database file is a real file, # store it in the database hash. file2hash() if (-f $db); given ($mode) { when ('import') { #say("Enqueueing files"); # Get all the file names and put them in the queue. foreach my $fn (sort(keys(%md5h))) { $q->enqueue($fn); } } when ('index') { #say("Enqueueing files"); # Get all the file names and put them in the queue. foreach my $fn (getfiles($dn)) { while ($file_stack >= $disk_size) { say("$file_stack > $disk_size, yielding"); yield(); } # Unless file name exists in the database hash, # continue. unless ($md5h{$fn}) { my $size = (stat($fn))[7]; if ($size < $disk_size) { say("Reading file $fn"); open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!"; read(FILE, $file_contents{$fn}, $size); close(FILE) or die "Can't close '$fn': $!"; say("Queueing file $fn"); $file_stack += length($file_contents{$fn}); $q->enqueue($fn); } else { say("Skipping $fn"); $large{$fn} = 1; } } } } when ('test') { #say("Enqueueing files"); # Fetch all the keys for the database hash and put # them in the queue. foreach my $fn (sort(keys(%md5h))) { while ($file_stack >= $disk_size) { say("$file_stack > $disk_size, yielding"); yield(); } my $size = (stat($fn))[7]; if ($size < $disk_size) { say("Reading file $fn"); open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!"; read(FILE, $file_contents{$fn}, $size); close(FILE) or die "Can't close '$fn': $!"; $file_stack += length($file_contents{$fn}); $q->enqueue($fn); } else { say("Skipping $fn"); $large{$fn} = 1; } } } } # This block is run after all the files that fit in RAM are # done processing. This deals with the large files that # don't fit in RAM. if (%large) { while ($file_stack > 0) { yield(); } foreach my $fn (sort(keys(%large))) { $parts[1] = ceil((stat($fn))[7] / $disk_size); my $last = $parts[1]; open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!"; foreach my $n (1 .. $last) { while ($file_stack > 0) { yield(); } say "${fn}\nPart $n of $parts[1] done .."; if ($n == 1) { sysread(FILE, $file_contents{$fn}, $disk_size); lock($file_stack); $file_stack += length($file_contents{$fn}); $parts[0] = $n; $q->enqueue($fn); } elsif ($n == $last) { sysread(FILE, $file_contents{$fn}, $disk_size); lock($file_stack); $file_stack += length($file_contents{$fn}); $parts[0] = $n; } else { sysread(FILE, $file_contents{$fn}, $disk_size); lock($file_stack); $file_stack += length($file_contents{$fn}); $parts[0] = $n; } } close(FILE) or die "Can't close '$fn': $!"; foreach (@parts) { $_ = 0; } } } # use Digest::MD5; # $md5 = Digest::MD5->new; # $md5->add('foo', 'bar'); # $md5->add('baz'); # $digest = $md5->hexdigest; # print "Digest is $digest\n"; $stopping = 1; foreach my $t (threads->list()) { $t->join(); } #say("All threads joined"); # Print the hash to the database file and close the log hash2file(); logger('end', $n); } }