md5db script (threaded, ram-only version) with flac and large file support
Posted by Anonymous on Sat 22nd Sep 2012 13:05
raw | new post
view followups (newest first): md5db script (threaded, ram-only version) with flac and large file support by Anonymous
modification of post by Anonymous (view diff)

#!/usr/bin/perl

use 5.12.3;
use strict;
use warnings;
use Cwd qw(abs_path cwd);
use Digest::MD5 qw(md5_hex);
use IO::Handle qw(autoflush);
use File::Basename qw(basename dirname);
#use File::Slurp qw(read_file);
use diagnostics;

use threads qw(yield);
use threads::shared;
use Thread::Queue;
use Thread::Semaphore;
#use Fcntl qw(:flock);
use POSIX qw(SIGINT);
use POSIX qw(ceil);

chomp(my $cores = `grep -c ^processor /proc/cpuinfo`);

my (@lib, $mode);

# Path to and name of log file to be used for logging.
my $logf = "$ENV{HOME}/md5db.log";

# Delimiter used for database
my $delim = "\t\*\t";

# Array for storing the actual arguments used by the script internally.
# Might be useful for debugging.
my @cmd = (basename($0));

# Name of database file.
my $db = 'md5.db';

# Clear screen command.
my $clear = `clear && echo`;

# Creating a hash that will store the names of files that are
# too big to fit into RAM. We'll process them last.
my %large :shared;

# Creating a few shared variables.
# %err will be used for errors
# $n will be used to count the number of files processed
# %md5h is the database hash
my %err :shared;
my $n :shared = 0;
my %md5h :shared;
my %file_contents :shared;
my $stopping :shared = 0;
my $file_stack :shared = 0;
my $busy :shared = 0;

my $disk_size = 1000000000;

# This will be used to control access to the logger subroutine.
my $semaphore = Thread::Semaphore->new();

POSIX::sigaction(SIGINT, POSIX::SigAction->new(\&handler))
|| die "Error setting SIGINT handler: $!\n";

# Creating a custom POSIX signal handler.
# First we create a shared variable that will work as a SIGINT switch.
# Then we define the handler subroutine.
# If ^C is pressed for the first time,
# trip the switch and let the current threads finish.
# If ^C is pressed a second time, quit directly.
# Each subroutine to be used for starting threads will have to
# take notice of the state of the $saw_sigint variable.
my $saw_sigint :shared = 0;
sub handler {
        if ($saw_sigint == 1) {
                logger('int', $n);
                hash2file();
                exit;
        } else { $saw_sigint = 1; }
}

# Open file handle for the log file
open(my $LOG, '>>', $logf) or die "Can't open '$logf': $!";

# Make the $LOG file handle unbuffered for instant logging.
$LOG->autoflush(1);

# Duplicate STDERR as a regular file handle
open(my $SE, ">&STDERR") or die "Can't duplicate STDERR: $!";

### Subroutine for printing usage instructions
sub usage {

        my $s = basename($0);

        say <<"HELP"
Usage: $s [options] [directory 1] .. [directory N]

        -help Print this help message.

        -double Check database for files that have identical
        hashes.

        -import Import MD5 sums to the database from already existing
        \*.MD5 files in each directory.

        -index Index new files in each directory.

        -test Test the MD5 sums of the files in the database to see if
        they've changed.

HELP
}

# This loop goes through the argument list as passed to the script
# by the user when ran.
foreach my $arg (@ARGV) {

        # If argument starts with a dash '-', interprete it as an option
        if ($arg =~ /^-/) {

                given ($arg) {
                        # When '-double', set script mode to 'double', and call
                        # the md5double subroutine later.
                        when (/^-double$/) {
                                if (!$mode) { push(@cmd, $arg); $mode = 'double'; }
                        }

                        # When '-import', set script mode to 'import', and call
                        # the md5import subroutine later.
                        when (/^-import$/) {
                                if (!$mode) { push(@cmd, $arg); $mode = 'import'; }
                        }

                        # When '-help', set script mode to 'help', and print
                        # usage instructions later.
                        when (/^-help$/) {
                                if (!$mode) { push(@cmd, $arg); $mode = 'help'; }
                        }

                        # When '-index', set script mode to 'index', and call
                        # the md5index subroutine later.
                        when (/^-index$/) {
                                if (!$mode) { push(@cmd, $arg); $mode = 'index'; }
                        }

                        # When '-test', set the script mode to 'test', and call
                        # the md5test subroutine later.
                        when (/^-test$/) {
                                if (!$mode) { push(@cmd, $arg); $mode = 'test'; }
                        }
                }
        # If argument is a directory, include it in the @lib array
        } elsif (-d $arg) { push(@lib, $arg); push(@cmd, $arg); }
}

# If no switches were used, print usage instructions
if (!@lib || !$mode || $mode eq 'help')
        { usage; exit; }

#say "@cmd\n";

# Subroutine for controlling the log file
# Applying a semaphore so multiple threads won't try to
# access it at once, just in case ;-)
sub logger {

        $semaphore->down();

        my($arg, $sw, @fn, $n);

        # Creating a variable to hold the current time.
        my $now = localtime(time);

        # Array of accepted switches to this subroutine
        my @larg = qw{start int gone corr diff end};

        # Loop through all the arguments passed to this subroutine
        # Perform checks that decide which variable the arguments are to
        # be assigned to.
        CHECK: while (@_) {
                        $arg = shift(@_);

                        # If $arg is a switch, set the $sw variable and start
                        # the next iteration of the CHECK loop.
                        foreach (@larg) {
                                if ($_ eq $arg) { $sw = $arg; next CHECK; }
                        }

                        # If $arg is a number assign it to $n, if it's a file
                        # add it to @fn.
                        if ($arg =~ /^[0-9]*$/) { $n = $arg; }
                        else { push(@fn, $arg); }
        }
        given ($sw) {
                # Starts writing the log.
                when ('start') {
                        say $LOG "\n**** Logging started on $now ****\n";
                        say $LOG "Running script in '$mode' mode.\n";
                }
                # When the script is interrupted by user pressing ^C,
                # say so in STDOUT, close the log.
                when ('int') {
                        say "\nInterrupted by user!\n";
                        say $LOG $n . " file(s) were tested.";
                        say $LOG "\n**** Logging ended on $now ****\n";
                        close $LOG or die "Can't close '$LOG': $!";
                }
                # Called when file has been deleted or moved.
                when ('gone') {
                        say $LOG $fn[0] . "\n\t" . "has been (re)moved.\n";
                        $err{$fn[0]} = "has been (re)moved.\n";
                }
                # Called when file has been corrupted.
                when ('corr') {
                        say $LOG $fn[0] . "\n\t" .
                        "has been corrupted.\n";
                        $err{$fn[0]} = "has been corrupted.\n";
                }
                when ('diff') {
                        say $LOG $fn[0] . "\n\t" .
                                "doesn't match the hash in database.\n";
                        $err{$fn[0]} = "doesn't match the hash in database.\n";
                }
                # Called when done, and to close the log.
                # If no errors occurred write "Everything is OK!" to the log.
                # If errors occurred print the %err hash.
                # Either way, print number of files processed.
                when ('end') {
                        if (!%err) {
                                say $LOG "\nEverything is OK!\n";
                        } else {
                                say "\n**** Errors Occurred ****\n";
                                foreach my $fn (sort keys %err) {
                                        say $SE $fn . "\n\t" . $err{$fn};
                                }
                        }

                        say $LOG $n . " file(s) were tested.\n" if ($n);
                        say $LOG "\n**** Logging ended on $now ****\n";
                        close $LOG or die "Can't close '$LOG': $!";
                }
        }
        $semaphore->up();
}


# Subroutine for reading a database file into the database hash.
# This is the first subroutine that will be executed and all others
# depend upon it, cause without it we don't have a
# database hash to work with.
sub file2hash {

        # The format string which is used for parsing the database file
        my $format = qr/^.*\t\*\t[[:alnum:]]{32}$/;
        my (@dbfile, @gone);

        # Open the database file and read it into the @dbfile variable
        open(MD5DB, '<', $db) or die "Can't open '$db': $!";
        chomp (@dbfile = (<MD5DB>));
        close(MD5DB) or die "Can't close '$db': $!";

        # Loop through all the lines in the database file and split
        # them before storing in the database hash.
        # Also, print each line to STDOUT for debug purposes
        foreach my $line (@dbfile) {

                # If current line matches the proper database file format,
                # continue.
                if ($line =~ /$format/) {

                        my ($fn, $hash) = (split(/\Q$delim/, $line));

                        # If $fn is a real file and not already in the hash,
                        # continue.
                        if (-f $fn && ! $md5h{$fn}) {
                                        $md5h{$fn} = $hash;
                                        say "$fn". "$delim" . "$hash";

                        # Saves the names of deleted or moved files in '@gone'
                        # for printing at the end of this subroutine.
                        } else { push(@gone, $fn); }
                }
        }

        # Clears the screen, thereby scrolling past the database file print
        print $clear;

        # Loops through the @gone array and logs every file name
        # that's been deleted or moved.
        foreach my $fn (@gone) { logger('gone', $fn); }
}

# Subroutine for printing the database hash to the database file
sub hash2file {

        open(MD5DB, '>', "$db") or die "Can't open '$db': $!";
        # Loops through all the keys in the database hash and prints
        # the entries (divided by the $delim variable) to the database file.
        foreach my $k (sort keys %md5h) {
                say MD5DB $k . $delim . $md5h{$k};
        }
        close(MD5DB) or die "Can't close '$db': $!";
}

# Subroutine for finding files
# Finds all the files inside the directory name passed to it,
# and sorts the output before storing it in the @files array.
sub getfiles {

        my $dn = shift;
        my @files;

        open(FIND, '-|', qq(find -L "$dn" -type f -name "*"))
        or die "Can\'t run 'find': $!";
        while (my $fn = (<FIND>)) {
                        chomp($fn);
                        $fn =~ s($dn/)();
                        push(@files, $fn) if (-f $fn && $fn ne basename($db));
        }
        close(FIND) or die "Can't close 'find': $!";
        return @files;
}

sub md5double {

        if (!keys(%md5h)) {
                say "No database file. Run the script in 'index' mode first\n" .
                "to index the files.";
                exit;
        }

        # Loop through the %md5h hash and save the checksums as keys in a
        # new hash called %exists. Each of those keys will hold an
        # anonymous array with the matching file names.
        my %exists;
        foreach my $fn (keys(%md5h)) {
                my $hash = $md5h{$fn};
                if (!$exists{"$hash"}) {
                        $exists{"$hash"}->[0] = $fn;
                } else {
                        push(@{$exists{"$hash"}}, $fn);
                }
        }

        # Loop through the %exists hash and print files that are identical,
        # if any.
        foreach my $hash (keys(%exists)) {
                if (scalar(@{$exists{"$hash"}}) > 1) {
                        say "These files have the same hash (${hash}):";
                        foreach my $fn (@{$exists{"$hash"}}) {
                                say $fn;
                        }
                        say "";
                }
        }
}

# Subroutine for finding and parsing *.MD5 files, adding the hashes
# to the database hash and thereby also to the file.
# It takes 1 argument:
# (1) the thread queue
sub md5import {

        my $md5fn = shift;

        my ($fn, $hash, @fields, @lines);

        # The format string which is used for parsing the *.MD5 files.
        my $format = qr/^[[:alnum:]]{32}\s\*.*/;

        # If the file extension is *.MD5 in either upper- or
        # lowercase, continue.
        if ($md5fn =~ /.md5$/i) {

                # Open the *.MD5 file and read its contents to the
                # @lines array.
                open(MD5, '<', $md5fn) or die "Can't open '$md5fn': $!";
                chomp(@lines = (<MD5>));
                close(MD5) or die "Can't close '$md5fn': $!";

                # Loop to check that the format of the *.MD5 file really
                # is correct before proceeding.
                foreach my $line (@lines) {

                        # If format string matches the line(s) in the *.MD5
                        # file, continue.
                        if ($line =~ /$format/) {

                                # Split the line so that the hash and file name go
                                # into @fields array.
                                # After that strip the path (if any) of the file
                                # name, and prepend the path of the *.MD5 file to
                                # it instead.
                                # Store hash and file name in the $hash and $fn
                                # variables for readability.
                                @fields = split(/\s\Q*/, $line, 2);
                                my $path = dirname($md5fn);
                                $hash = $fields[0];

                                if ($path eq '.') { $fn = basename($fields[1]); }
                                else { $fn = dirname($md5fn)
                                . '/' . basename($fields[1]); }

                                # Convert CR+LF newlines to proper LF to avoid
                                # identical file names from being interpreted as
                                # different.
                                $fn =~ s/\r//;

                                # Unless file name already is in the database hash,
                                # print a message, add it to the hash.
                                if (! $md5h{$fn} && -f $fn) {

                                        say "$fn" . "\n\t" .
                                        "Imported MD5 sum from '" .
                                        basename($md5fn) .
                                        "'.\n";

                                        $md5h{$fn} = $hash;

                                        # If file name is not a real file, write to
                                        # the log.
                                        # If file name is in database hash but the
                                        # MD5 sum from the MD5 file doesn't match,
                                        # print to the log.
                                } elsif (! -f $fn) { logger('gone', $fn); }
                                elsif ($md5h{$fn} ne $hash)
                                { logger('diff', $md5fn); }
                        }
                }
        }
}


sub md5sum {
        my $fn = shift;

        while ($busy) { yield(); }

        if ($large{$fn}) {
                lock($busy);
                $busy = 1;
                open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!";
                my $hash = Digest::MD5->new->addfile("FILE")->hexdigest;
                close(FILE) or die "Can't close '$fn': $!";
                $busy = 0;
                return $hash;
        } else {
                my $hash = md5_hex($file_contents{$fn});
                lock($file_stack);
                $file_stack -= length($file_contents{$fn});
                lock(%file_contents);
                delete($file_contents{$fn});
                return $hash;
        }
}

# Subroutine to index the files
# i.e calculate and store the MD5 sums in the database hash/file.
# It takes 1 argument:
# (1) the thread queue
sub md5index {
        my $q = shift;
        my $tid = threads->tid();

        # Loop through the thread que.
        LOOP2: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
                        if ($fn) {
                                $md5h{$fn} = md5sum($fn);
                                say "$tid $fn: done indexing (${file_stack})";
                                lock($n);
                                $n++;
                        } else {
                                yield();
                        }

                        # If the $saw_sigint variable has been tripped.
                        # Quit this 'while' loop, thereby closing the thread.
                        if ($saw_sigint == 1) {
                                say "Closing thread: " . $tid;
                                last;
                        }
        }

        while (!$q->pending() && !$stopping) {
                yield();
                goto(LOOP2);
        }
}

# Subroutine for testing to see if the MD5 sums
# in the database file are correct (i.e. have changed or not).
# It takes 1 argument:
# (1) the thread queue
sub md5test {
        my $q = shift;
        my $tid = threads->tid();
        my ($oldmd5, $newmd5);

        # Loop through the thread queue.
        LOOP: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
                                if ($fn) {
                                        $newmd5 = md5sum($fn);
                                        say "$tid $fn: done testing (${file_stack})";
                                        $oldmd5 = $md5h{$fn};

                                        # If the new MD5 sum doesn't match the one in the hash,
                                        # and file doesn't already exist in the %err hash,
                                        # log it and replace the old MD5 sum in the hash with
                                        # the new one.
                                        if ($newmd5 ne $oldmd5 && ! $err{$fn}) {
                                                logger('diff', $fn);
                                                $md5h{$fn} = $newmd5;
                                        }
                                        lock($n);
                                        $n++;
                                } else {
                                        yield();
                                }
                                # If the $saw_sigint variable has been tripped.
                                # Quit this 'while' loop, thereby closing the thread.
                                if ($saw_sigint == 1) {
                                        say "Closing thread: " . $tid;
                                        last;
                                }
        }

        while (!$q->pending() && !$stopping) {
                yield();
                goto(LOOP);
        }
}

sub md5flac {
        my $fn = shift;
        my (@req, $hash);

        if ($fn =~ /.flac$/i) {

                if (! @req) {
                        chomp(@req = ( `which flac metaflac 2>&-` ));

                        if (! $req[0] || ! $req[1]) {
                                say "You need both 'flac' and 'metaflac' to test FLAC files!\n" .
                                "Using normal test method...\n";
                                @req = '0';
                                return;
                        }
                }

                unless ($req[0] = '0') {
                        chomp($hash = `metaflac --show-md5sum "$fn" 2>&-`);
                        if ($? != 0 && $? != 2) { logger('corr', $fn); return; }

                        system('flac', '--totally-silent', '--test', "$fn");
                        if ($? != 0 && $? != 2) { logger('corr', $fn); return; }

                        return $hash;
                }
        }
}

# Create the thread queue.
my $q = Thread::Queue->new();

# Depending on which script mode is active,
# set the @run array to the correct arguments.
# This will be used to start the threads later.
my @run;
given ($mode) {
        when ('index') {
                @run = (\&md5index, $q);
        }
        when ('test') {
                @run = (\&md5test, $q);
        }
}

# If script mode is either 'import' or 'double' we'll start only
# one thread, else we'll start as many as the available number of CPUs.
my @threads;
if ($mode ne 'import' && $mode ne 'double') {
        foreach (1 .. $cores) {
                push(@threads, threads->create(@run));
        }
}

# This loop is where the actual action takes place
# (i.e. where all the subroutines get called from)
foreach my $dn (@lib) {
        if (-d $dn) {
                # Changing $dn to the absolute path.
                $dn = abs_path($dn);

                # Adding the current PATH to the $db variable.
                $db = "$dn/$db";

                # Change directory to $dn.
                chdir($dn)
                        or die "Can't change directory to '$dn': $!";

                # Start logging.
                logger('start');

                # If the database file is a real file,
                # store it in the database hash.
                file2hash() if (-f $db);

                given ($mode) {
                        when ('double') {
                                # Find identical files in database.
                                md5double();
                        }
                        when ('import') {
                                # For all the files in $dn, run md5import.
                                foreach my $fn (getfiles($dn)) { md5import($fn); }
                        }
                        when ('index') {
                                #say("Enqueueing files");
                                # Get all the file names and put them in the queue.
                                foreach my $fn (getfiles($dn)) {
                                        while ($file_stack >= $disk_size) {
                                                my $active = threads->running();
                                                say("${active}: $file_stack > $disk_size");
                                                yield();
                                        }
                                        # Unless file name exists in the database hash,
                                        # continue.
                                        unless ($md5h{$fn}) {
                                                my $size = (stat($fn))[7];
                                                if ($size < $disk_size) {

                                                        #say("${fn}: reading");
                                                        open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!";
                                                        sysread(FILE, $file_contents{$fn}, $size);
                                                        close(FILE) or die "Can't close '$fn': $!";

                                                        #say("${fn}: queueing");
                                                        lock($file_stack);
                                                        $file_stack += length($file_contents{$fn});
                                                        $q->enqueue($fn);
                                                } else {
                                                        $large{$fn} = 1;
                                                }
                                        }
                                }
                        }
                        when ('test') {
                                #say("Enqueueing files");
                                # Fetch all the keys for the database hash and put
                                # them in the queue.
                                foreach my $fn (sort(keys(%md5h))) {
                                        while ($file_stack >= $disk_size) {
                                                say("$file_stack > $disk_size");
                                                yield();
                                        }

                                        my $size = (stat($fn))[7];
                                        if ($size < $disk_size) {

                                                #say("${fn}: reading");
                                                open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!";
                                                sysread(FILE, $file_contents{$fn}, $size);
                                                close(FILE) or die "Can't close '$fn': $!";

                                                #say("${fn}: queueing");
                                                lock($file_stack);
                                                $file_stack += length($file_contents{$fn});
                                                $q->enqueue($fn);
                                        } else {
                                                $large{$fn} = 1;
                                        }
                                }
                        }
                }

                if (%large) {
                        while ($file_stack > 0) {
                                say("$file_stack > $disk_size");
                                yield();
                        }
                        foreach my $fn (sort(keys(%large))) {
                                #say("${fn}: queueing");
                                $q->enqueue($fn);
                        }
                }

#     use Digest::MD5;
#    $md5 = Digest::MD5->new;
#    $md5->add('foo', 'bar');
#    $md5->add('baz');
#    $digest = $md5->hexdigest;
#    print "Digest is $digest\n";


                $stopping = 1;
                foreach my $t (threads->list()) { $t->join(); }
                #say("All threads joined");

                # Print the hash to the database file and close the log
                hash2file();
                logger('end', $n);
        }
}