- #!/usr/bin/perl
- use 5.12.3;
- use strict;
- use warnings;
- #use File::Slurp qw(read_file);
- use diagnostics;
- use threads::shared;
- use Thread::Queue;
- use Thread::Semaphore;
- #use Fcntl qw(:flock);
- my (@lib, $mode);
- # Path to and name of log file to be used for logging.
- my $logf = "$ENV{HOME}/md5db.log";
- # Delimiter used for database
- my $delim = "\t\*\t";
- # Array for storing the actual arguments used by the script internally.
- # Might be useful for debugging.
- my @cmd = (basename($0));
- # Name of database file.
- my $db = 'md5.db';
- # Clear screen command.
- my $clear = `clear && echo`;
- # Creating a hash that will store the names of files that are
- # too big to fit into RAM. We'll process them last.
- my %large :shared;
- # Creating a few shared variables.
- # %err will be used for errors
- # $n will be used to count the number of files processed
- # %md5h is the database hash
- my %err :shared;
- my $n :shared = 0;
- my %md5h :shared;
- my %file_contents :shared;
- my $stopping :shared = 0;
- my $file_stack :shared = 0;
- my $busy :shared = 0;
- my $disk_size = 1000000000;
- # This will be used to control access to the logger subroutine.
- my $semaphore = Thread::Semaphore->new();
- POSIX::sigaction(SIGINT, POSIX::SigAction->new(\&handler))
- # Creating a custom POSIX signal handler.
- # First we create a shared variable that will work as a SIGINT switch.
- # Then we define the handler subroutine.
- # If ^C is pressed for the first time,
- # trip the switch and let the current threads finish.
- # If ^C is pressed a second time, quit directly.
- # Each subroutine to be used for starting threads will have to
- # take notice of the state of the $saw_sigint variable.
- my $saw_sigint :shared = 0;
- sub handler {
- if ($saw_sigint == 1) {
- logger('int', $n);
- hash2file();
- exit;
- } else { $saw_sigint = 1; }
- }
- # Open file handle for the log file
- # Make the $LOG file handle unbuffered for instant logging.
- $LOG->autoflush(1);
- # Duplicate STDERR as a regular file handle
- ### Subroutine for printing usage instructions
- sub usage {
- my $s = basename($0);
- say <<"HELP"
- Usage: $s [options] [directory 1] .. [directory N]
- -help Print this help message.
- -double Check database for files that have identical
- hashes.
- -import Import MD5 sums to the database from already existing
- -test Test the MD5 sums of the files in the database to see if
- they've changed.
- HELP
- }
- # This loop goes through the argument list as passed to the script
- # by the user when ran.
- foreach my $arg (@ARGV) {
- # If argument starts with a dash '-', interprete it as an option
- if ($arg =~ /^-/) {
- given ($arg) {
- # When '-double', set script mode to 'double', and call
- # the md5double subroutine later.
- when (/^-double$/) {
- if (!$mode) { push(@cmd, $arg); $mode = 'double'; }
- }
- # the md5import subroutine later.
- when (/^-import$/) {
- }
- # When '-help', set script mode to 'help', and print
- # usage instructions later.
- when (/^-help$/) {
- if (!$mode) { push(@cmd, $arg); $mode = 'help'; }
- }
- # the md5index subroutine later.
- when (/^-index$/) {
- }
- # When '-test', set the script mode to 'test', and call
- # the md5test subroutine later.
- when (/^-test$/) {
- if (!$mode) { push(@cmd, $arg); $mode = 'test'; }
- }
- }
- # If argument is a directory, include it in the @lib array
- } elsif (-d $arg) { push(@lib, $arg); push(@cmd, $arg); }
- }
- # If no switches were used, print usage instructions
- if (!@lib || !$mode || $mode eq 'help')
- { usage; exit; }
- #say "@cmd\n";
- # Subroutine for controlling the log file
- # Applying a semaphore so multiple threads won't try to
- # access it at once, just in case ;-)
- sub logger {
- $semaphore->down();
- my($arg, $sw, @fn, $n);
- # Creating a variable to hold the current time.
- # Array of accepted switches to this subroutine
- # Loop through all the arguments passed to this subroutine
- # Perform checks that decide which variable the arguments are to
- # be assigned to.
- CHECK: while (@_) {
- # If $arg is a switch, set the $sw variable and start
- # the next iteration of the CHECK loop.
- foreach (@larg) {
- if ($_ eq $arg) { $sw = $arg; next CHECK; }
- }
- # If $arg is a number assign it to $n, if it's a file
- # add it to @fn.
- if ($arg =~ /^[0-9]*$/) { $n = $arg; }
- }
- given ($sw) {
- # Starts writing the log.
- when ('start') {
- say $LOG "\n**** Logging started on $now ****\n";
- say $LOG "Running script in '$mode' mode.\n";
- }
- # When the script is interrupted by user pressing ^C,
- # say so in STDOUT, close the log.
- when ('int') {
- say "\nInterrupted by user!\n";
- say $LOG $n . " file(s) were tested.";
- say $LOG "\n**** Logging ended on $now ****\n";
- }
- # Called when file has been deleted or moved.
- when ('gone') {
- say $LOG $fn[0] . "\n\t" . "has been (re)moved.\n";
- $err{$fn[0]} = "has been (re)moved.\n";
- }
- # Called when file has been corrupted.
- when ('corr') {
- say $LOG $fn[0] . "\n\t" .
- "has been corrupted.\n";
- $err{$fn[0]} = "has been corrupted.\n";
- }
- when ('diff') {
- say $LOG $fn[0] . "\n\t" .
- "doesn't match the hash in database.\n";
- $err{$fn[0]} = "doesn't match the hash in database.\n";
- }
- # Called when done, and to close the log.
- # If no errors occurred write "Everything is OK!" to the log.
- # If errors occurred print the %err hash.
- # Either way, print number of files processed.
- when ('end') {
- if (!%err) {
- say $LOG "\nEverything is OK!\n";
- } else {
- say "\n**** Errors Occurred ****\n";
- say $SE $fn . "\n\t" . $err{$fn};
- }
- }
- say $LOG $n . " file(s) were tested.\n" if ($n);
- say $LOG "\n**** Logging ended on $now ****\n";
- }
- }
- $semaphore->up();
- }
- # Subroutine for reading a database file into the database hash.
- # This is the first subroutine that will be executed and all others
- # depend upon it, cause without it we don't have a
- # database hash to work with.
- sub file2hash {
- # The format string which is used for parsing the database file
- my $format = qr/^.*\t\*\t[[:alnum:]]{32}$/;
- my (@dbfile, @gone);
- # Open the database file and read it into the @dbfile variable
- # Loop through all the lines in the database file and split
- # them before storing in the database hash.
- # Also, print each line to STDOUT for debug purposes
- foreach my $line (@dbfile) {
- # If current line matches the proper database file format,
- # continue.
- if ($line =~ /$format/) {
- # If $fn is a real file and not already in the hash,
- # continue.
- if (-f $fn && ! $md5h{$fn}) {
- $md5h{$fn} = $hash;
- say "$fn". "$delim" . "$hash";
- # Saves the names of deleted or moved files in '@gone'
- # for printing at the end of this subroutine.
- }
- }
- # Clears the screen, thereby scrolling past the database file print
- # Loops through the @gone array and logs every file name
- # that's been deleted or moved.
- foreach my $fn (@gone) { logger('gone', $fn); }
- }
- # Subroutine for printing the database hash to the database file
- sub hash2file {
- # Loops through all the keys in the database hash and prints
- # the entries (divided by the $delim variable) to the database file.
- say MD5DB $k . $delim . $md5h{$k};
- }
- }
- # Subroutine for finding files
- # Finds all the files inside the directory name passed to it,
- # and sorts the output before storing it in the @files array.
- sub getfiles {
- my @files;
- while (my $fn = (<FIND>)) {
- }
- }
- sub md5double {
- say "No database file. Run the script in 'index' mode first\n" .
- "to index the files.";
- exit;
- }
- # Loop through the %md5h hash and save the checksums as keys in a
- # new hash called %exists. Each of those keys will hold an
- # anonymous array with the matching file names.
- my $hash = $md5h{$fn};
- if (!$exists{"$hash"}) {
- $exists{"$hash"}->[0] = $fn;
- } else {
- }
- }
- # Loop through the %exists hash and print files that are identical,
- # if any.
- say "These files have the same hash (${hash}):";
- foreach my $fn (@{$exists{"$hash"}}) {
- say $fn;
- }
- say "";
- }
- }
- }
- # Subroutine for finding and parsing *.MD5 files, adding the hashes
- # to the database hash and thereby also to the file.
- # It takes 1 argument:
- # (1) the thread queue
- sub md5import {
- my ($fn, $hash, @fields, @lines);
- # The format string which is used for parsing the *.MD5 files.
- my $format = qr/^[[:alnum:]]{32}\s\*.*/;
- # If the file extension is *.MD5 in either upper- or
- # lowercase, continue.
- if ($md5fn =~ /.md5$/i) {
- # Open the *.MD5 file and read its contents to the
- # @lines array.
- # Loop to check that the format of the *.MD5 file really
- # is correct before proceeding.
- foreach my $line (@lines) {
- # If format string matches the line(s) in the *.MD5
- # file, continue.
- if ($line =~ /$format/) {
- # Split the line so that the hash and file name go
- # into @fields array.
- # After that strip the path (if any) of the file
- # name, and prepend the path of the *.MD5 file to
- # it instead.
- # Store hash and file name in the $hash and $fn
- # variables for readability.
- my $path = dirname($md5fn);
- $hash = $fields[0];
- if ($path eq '.') { $fn = basename($fields[1]); }
- else { $fn = dirname($md5fn)
- . '/' . basename($fields[1]); }
- # Convert CR+LF newlines to proper LF to avoid
- # identical file names from being interpreted as
- # different.
- $fn =~ s/\r//;
- # Unless file name already is in the database hash,
- # print a message, add it to the hash.
- if (! $md5h{$fn} && -f $fn) {
- say "$fn" . "\n\t" .
- "Imported MD5 sum from '" .
- basename($md5fn) .
- "'.\n";
- $md5h{$fn} = $hash;
- # If file name is not a real file, write to
- # the log.
- # If file name is in database hash but the
- # MD5 sum from the MD5 file doesn't match,
- # print to the log.
- } elsif (! -f $fn) { logger('gone', $fn); }
- elsif ($md5h{$fn} ne $hash)
- { logger('diff', $md5fn); }
- }
- }
- }
- }
- sub md5sum {
- while ($busy) { yield(); }
- if ($large{$fn}) {
- lock($busy);
- $busy = 1;
- my $hash = Digest::MD5->new->addfile("FILE")->hexdigest;
- $busy = 0;
- } else {
- my $hash = md5_hex($file_contents{$fn});
- lock($file_stack);
- lock(%file_contents);
- }
- }
- # Subroutine to index the files
- # i.e calculate and store the MD5 sums in the database hash/file.
- # It takes 1 argument:
- # (1) the thread queue
- sub md5index {
- my $tid = threads->tid();
- # Loop through the thread que.
- LOOP2: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
- if ($fn) {
- $md5h{$fn} = md5sum($fn);
- say "$tid $fn: done indexing (${file_stack})";
- lock($n);
- $n++;
- } else {
- yield();
- }
- # If the $saw_sigint variable has been tripped.
- # Quit this 'while' loop, thereby closing the thread.
- if ($saw_sigint == 1) {
- say "Closing thread: " . $tid;
- last;
- }
- }
- while (!$q->pending() && !$stopping) {
- yield();
- }
- }
- # Subroutine for testing to see if the MD5 sums
- # in the database file are correct (i.e. have changed or not).
- # It takes 1 argument:
- # (1) the thread queue
- sub md5test {
- my $tid = threads->tid();
- my ($oldmd5, $newmd5);
- # Loop through the thread queue.
- LOOP: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
- if ($fn) {
- $newmd5 = md5sum($fn);
- say "$tid $fn: done testing (${file_stack})";
- $oldmd5 = $md5h{$fn};
- # If the new MD5 sum doesn't match the one in the hash,
- # and file doesn't already exist in the %err hash,
- # log it and replace the old MD5 sum in the hash with
- # the new one.
- if ($newmd5 ne $oldmd5 && ! $err{$fn}) {
- logger('diff', $fn);
- $md5h{$fn} = $newmd5;
- }
- lock($n);
- $n++;
- } else {
- yield();
- }
- # If the $saw_sigint variable has been tripped.
- # Quit this 'while' loop, thereby closing the thread.
- if ($saw_sigint == 1) {
- say "Closing thread: " . $tid;
- last;
- }
- }
- while (!$q->pending() && !$stopping) {
- yield();
- }
- }
- sub md5flac {
- my (@req, $hash);
- if ($fn =~ /.flac$/i) {
- if (! @req) {
- if (! $req[0] || ! $req[1]) {
- say "You need both 'flac' and 'metaflac' to test FLAC files!\n" .
- "Using normal test method...\n";
- @req = '0';
- }
- }
- unless ($req[0] = '0') {
- }
- }
- }
- # Create the thread queue.
- my $q = Thread::Queue->new();
- # Depending on which script mode is active,
- # set the @run array to the correct arguments.
- # This will be used to start the threads later.
- my @run;
- given ($mode) {
- when ('index') {
- @run = (\&md5index, $q);
- }
- when ('test') {
- @run = (\&md5test, $q);
- }
- }
- # If script mode is either 'import' or 'double' we'll start only
- # one thread, else we'll start as many as the available number of CPUs.
- my @threads;
- if ($mode ne 'import' && $mode ne 'double') {
- foreach (1 .. $cores) {
- }
- }
- # This loop is where the actual action takes place
- # (i.e. where all the subroutines get called from)
- foreach my $dn (@lib) {
- if (-d $dn) {
- # Changing $dn to the absolute path.
- $dn = abs_path($dn);
- # Adding the current PATH to the $db variable.
- $db = "$dn/$db";
- # Change directory to $dn.
- # Start logging.
- logger('start');
- # If the database file is a real file,
- # store it in the database hash.
- file2hash() if (-f $db);
- given ($mode) {
- when ('double') {
- # Find identical files in database.
- md5double();
- }
- when ('import') {
- # For all the files in $dn, run md5import.
- foreach my $fn (getfiles($dn)) { md5import($fn); }
- }
- when ('index') {
- #say("Enqueueing files");
- # Get all the file names and put them in the queue.
- foreach my $fn (getfiles($dn)) {
- while ($file_stack >= $disk_size) {
- my $active = threads->running();
- say("${active}: $file_stack > $disk_size");
- yield();
- }
- # Unless file name exists in the database hash,
- # continue.
- unless ($md5h{$fn}) {
- if ($size < $disk_size) {
- #say("${fn}: reading");
- #say("${fn}: queueing");
- lock($file_stack);
- $q->enqueue($fn);
- } else {
- $large{$fn} = 1;
- }
- }
- }
- }
- when ('test') {
- #say("Enqueueing files");
- # Fetch all the keys for the database hash and put
- # them in the queue.
- while ($file_stack >= $disk_size) {
- say("$file_stack > $disk_size");
- yield();
- }
- if ($size < $disk_size) {
- #say("${fn}: reading");
- #say("${fn}: queueing");
- lock($file_stack);
- $q->enqueue($fn);
- } else {
- $large{$fn} = 1;
- }
- }
- }
- }
- if (%large) {
- while ($file_stack > 0) {
- say("$file_stack > $disk_size");
- yield();
- }
- #say("${fn}: queueing");
- $q->enqueue($fn);
- }
- }
- # use Digest::MD5;
- # $md5 = Digest::MD5->new;
- # $md5->add('foo', 'bar');
- # $md5->add('baz');
- # $digest = $md5->hexdigest;
- # print "Digest is $digest\n";
- $stopping = 1;
- #say("All threads joined");
- # Print the hash to the database file and close the log
- hash2file();
- logger('end', $n);
- }
- }
md5db script (threaded, ram-only version) with flac and large file support
Posted by Anonymous on Sat 22nd Sep 2012 13:05
raw | new post
view followups (newest first): md5db script (threaded, ram-only version) with flac and large file support by Anonymous
modification of post by Anonymous (view diff)
Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.