pastebin - collaborative debugging tool
kpaste.net RSS

Difference between
modified post 765b45d6 by Anonymous on Fri 21st Sep 2012 07:24
original post fa3 by Anonymous on Fri 21st Sep 2012 04:49
Download diff
Show old version | new version | both versions

    
11
#!/usr/bin/perl
33
use 5.12.3;
44
use strict;
55
use warnings;
66
use Cwd qw(abs_path cwd);
77
use Digest::MD5 qw(md5_hex);
88
use IO::Handle qw(autoflush);
99
use File::Basename qw(basename dirname);
1010
#use File::Slurp qw(read_file);
1111
use diagnostics;
1313
use threads qw(yield);
1414
use threads::shared;
1515
use Thread::Queue;
1616
use Thread::Semaphore;
1717
#use Fcntl qw(:flock);
1818
use POSIX qw(SIGINT);
1919
use POSIX qw(ceil);
2121
chomp(my $cores = `grep -c ^processor /proc/cpuinfo`);
2323
my (@lib, $mode);
2525
# Path to and name of log file to be used for logging.
2626
my $logf = "$ENV{HOME}/md5db.log";
2828
# Delimiter used for database
2929
my $delim = "\t\*\t";
3131
# Array for storing the actual arguments used by the script internally.
3232
# Might be useful for debugging.
3333
my @cmd = (basename($0));
3535
# Name of database file.
3636
my $db = 'md5.db';
3838
# Clear screen command.
3939
my $clear = `clear && echo`;
4141
# Creating a hash that will store the names of files that are
4242
# too big to fit into RAM. We'll process them last.
4343
my %large;
4545
# Creating a few shared variables.
4646
# %err will be used for errors
4747
# $n will be used to count the number of files processed
4848
# %md5h is the database hash
4949
my %err :shared;
5050
my $n :shared = 0;
5151
my %md5h :shared;
5252
my %file_contents :shared;
5353
my $stopping :shared = 0;
5454
my $file_stack :shared = 0;
5555
my $busy :shared = 0;
5656
my @parts :shared = (0) x 2;
5858
my $disk_size = 500000000;
6060
# This will be used to control access to the logger subroutine.
6161
my $semaphore = Thread::Semaphore->new();
6363
POSIX::sigaction(SIGINT, POSIX::SigAction->new(\&handler))
6464
|| die "Error setting SIGINT handler: $!\n";
6666
# Creating a custom POSIX signal handler.
6767
# First we create a shared variable that will work as a SIGINT switch.
6868
# Then we define the handler subroutine.
6969
# If ^C is pressed for the first time,
7070
# trip the switch and let the current threads finish.
7171
# If ^C is pressed a second time, quit directly.
7272
# Each subroutine to be used for starting threads will have to
7373
# take notice of the state of the $saw_sigint variable.
7474
my $saw_sigint :shared = 0;
7575
sub handler {
7676
	if ($saw_sigint == 1) {
7777
		logger('int', $n);
7878
		hash2file();
7979
		exit;
8080
	} else { $saw_sigint = 1; }
8181
}
8383
# Open file handle for the log file
8484
open(my $LOG, '>>', $logf) or die "Can't open '$logf': $!";
8686
# Make the $LOG file handle unbuffered for instant logging.
8787
$LOG->autoflush(1);
8989
# Duplicate STDERR as a regular file handle
9090
open(my $SE, ">&STDERR") or die "Can't duplicate STDERR: $!";
9292
### Subroutine for printing usage instructions
9393
sub usage {
9595
	my $s = basename($0);
9797
	say <<"HELP"
9898
Usage: $s [options] [directory 1] .. [directory N]
100100
	-help Print this help message.
102102
	-import Import MD5 sums to the database from already existing
103103
	\*.MD5 files in each directory.
105105
	-index Index new files in each directory.
107107
	-test Test the MD5 sums of the files in the database to see if
108108
	they've changed.
110110
HELP
111111
}
113113
# This loop goes through the argument list as passed to the script
114114
# by the user when ran.
115115
foreach my $arg (@ARGV) {
117117
	# If argument starts with a dash '-', interprete it as an option
118118
	if ($arg =~ /^-/) {
120120
		given ($arg) {
122122
			# When '-import', set script mode to 'import', and call
123123
			# the md5import subroutine later.
124124
			when (/^-import$/) {
125125
				if (!$mode) { push(@cmd, $arg); $mode = 'import'; }
126126
			}
128128
			# When '-help', set script mode to 'help', and print
129129
			# usage instructions later.
130130
			when (/^-help$/) {
131131
				if (!$mode) { push(@cmd, $arg); $mode = 'help'; }
132132
			}
134134
			# When '-index', set script mode to 'index', and call
135135
			# the md5index subroutine later.
136136
			when (/^-index$/) {
137137
				if (!$mode) { push(@cmd, $arg); $mode = 'index'; }
138138
			}
140140
			# When '-test', set the script mode to 'test', and call
141141
			# the md5test subroutine later.
142142
			when (/^-test$/) {
143143
				if (!$mode) { push(@cmd, $arg); $mode = 'test'; }
144144
			}
145145
		}
146146
	# If argument is a directory, include it in the @lib array
147147
	} elsif (-d $arg) { push(@lib, $arg); push(@cmd, $arg); }
148148
}
150150
# If no switches were used, print usage instructions
151151
if (!@lib ||
152152
	!$mode ||
153153
	$mode eq 'help')
154154
	{ usage; exit; }
156156
#say "@cmd\n";
158158
# Subroutine for controlling the log file
159159
# Applying a semaphore so multiple threads won't try to
160160
# access it at once, just in case ;-)
161161
sub logger {
163163
	$semaphore->down();
165165
	my($arg, $sw, @fn, $n);
167167
	# Creating a variable to hold the current time.
168168
	my $now = localtime(time);
170170
	# Array of accepted switches to this subroutine
171171
	my @larg = qw{start int gone corr diff end};
173173
	# Loop through all the arguments passed to this subroutine
174174
	# Perform checks that decide which variable the arguments are to
175175
	# be assigned to.
176176
	CHECK: while (@_) {
177177
			$arg = shift(@_);
179179
			# If $arg is a switch, set the $sw variable and start
180180
			# the next iteration of the CHECK loop.
181181
			foreach (@larg) {
182182
				if ($_ eq $arg) { $sw = $arg; next CHECK; }
183183
			}
185185
			# If $arg is a number assign it to $n, if it's a file
186186
			# add it to @fn.
187187
			if ($arg =~ /^[0-9]*$/) { $n = $arg; }
188188
			else { push(@fn, $arg); }
189189
	}
190190
	given ($sw) {
191191
		# Starts writing the log.
192192
		when ('start') {
193193
			say $LOG "\n**** Logging started on $now ****\n";
194194
			say $LOG "Running script in '$mode' mode.\n";
195195
		}
196196
		# When the script is interrupted by user pressing ^C,
197197
		# say so in STDOUT, close the log.
198198
		when ('int') {
199199
			say "\nInterrupted by user!\n";
200200
			say $LOG $n . " file(s) were tested.";
201201
			say $LOG "\n**** Logging ended on $now ****\n";
202202
			close $LOG or die "Can't close '$LOG': $!";
203203
		}
204204
		# Called when file has been deleted or moved.
205205
		when ('gone') {
206206
			say $LOG $fn[0] . "\n\t" . "has been (re)moved.\n";
207207
			$err{$fn[0]} = "has been (re)moved.\n";
208208
		}
209209
		# Called when file has been corrupted.
210210
		when ('corr') {
211211
			say $LOG $fn[0] . "\n\t" .
212212
			"has been corrupted.\n";
213213
			$err{$fn[0]} = "has been corrupted.\n";
214214
		}
215215
		when ('diff') {
216216
			say $LOG $fn[0] . "\n\t" .
217217
				"doesn't match the hash in database.\n";
218218
			$err{$fn[0]} = "doesn't match the hash in database.\n";
219219
		}
220220
		# Called when done, and to close the log.
221221
		# If no errors occurred write "Everything is OK!" to the log.
222222
		# If errors occurred print the %err hash.
223223
		# Either way, print number of files processed.
224224
		when ('end') {
225225
			if (!%err) {
226226
				say $LOG "\nEverything is OK!\n";
227227
			} else {
228228
				say "\n**** Errors Occurred ****\n";
229229
				foreach my $fn (sort keys %err) {
230230
					say $SE $fn . "\n\t" . $err{$fn};
231231
				}
232232
			}
234234
			say $LOG $n . " file(s) were tested.\n" if ($n);
235235
			say $LOG "\n**** Logging ended on $now ****\n";
236236
			close $LOG or die "Can't close '$LOG': $!";
237237
		}
238238
	}
239239
	$semaphore->up();
240240
}
243243
# Subroutine for reading a database file into the database hash.
244244
# This is the first subroutine that will be executed and all others
245245
# depend upon it, cause without it we don't have a
246246
# database hash to work with.
247247
sub file2hash {
249249
	# The format string which is used for parsing the database file
250-
	my $format = '^.*\t\*\t[[:alnum:]]{32}$';
250+
	my $format = qr/^.*\t\*\t[[:alnum:]]{32}$/;
251251
	my (@dbfile, @gone);
253253
	# Open the database file and read it into the @dbfile variable
254254
	open(MD5DB, '<', $db) or die "Can't open '$db': $!";
255255
	chomp (@dbfile = (<MD5DB>));
256256
	close(MD5DB) or die "Can't close '$db': $!";
258258
	# Loop through all the lines in the database file and split
259259
	# them before storing in the database hash.
260260
	# Also, print each line to STDOUT for debug purposes
261261
	foreach my $line (@dbfile) {
263263
		# If current line matches the proper database file format,
264264
		# continue.
265265
		if ($line =~ /$format/) {
267267
			my ($fn, $hash) = (split(/\Q$delim/, $line));
269269
			# If $fn is a real file and not already in the hash,
270270
			# continue.
271271
			if (-f $fn && ! $md5h{$fn}) {
272272
					$md5h{$fn} = $hash;
273273
					say "$fn". "$delim" . "$hash";
275275
			# Saves the names of deleted or moved files in '@gone'
276276
			# for printing at the end of this subroutine.
277277
			} else { push(@gone, $fn); }
278278
		}
279279
	}
281281
	# Clears the screen, thereby scrolling past the database file print
282282
	print $clear;
284284
	# Loops through the @gone array and logs every file name
285285
	# that's been deleted or moved.
286286
	foreach my $fn (@gone) { logger('gone', $fn); }
287287
}
289289
# Subroutine for printing the database hash to the database file
290290
sub hash2file {
292292
	open(MD5DB, '>', "$db") or die "Can't open '$db': $!";
293293
	# Loops through all the keys in the database hash and prints
294294
	# the entries (divided by the $delim variable) to the database file.
295295
	foreach my $k (sort keys %md5h) {
296296
		say MD5DB $k . $delim . $md5h{$k};
297297
	}
298298
	close(MD5DB) or die "Can't close '$db': $!";
299299
}
301301
# Subroutine for finding files
302302
# Finds all the files inside the directory name passed to it,
303303
# and sorts the output before storing it in the @files array.
304304
sub getfiles {
306306
	my $dn = shift;
307307
	my @files;
309309
	open(FIND, '-|', qq(find -L "$dn" -type f -name "*"))
310310
	or die "Can\'t run 'find': $!";
311311
	while (my $fn = (<FIND>)) {
312312
			chomp($fn);
313313
			$fn =~ s($dn/)();
314314
			push(@files, $fn) if (-f $fn && $fn ne basename($db));
315315
	}
316316
	close(FIND) or die "Can't close 'find': $!";
317317
	return @files;
318318
}
320320
# Subroutine for finding and parsing *.MD5 files, adding the hashes
321321
# to the database hash and thereby also to the file.
322322
# It takes 1 argument:
323323
# (1) the thread queue
324324
sub md5import {
325325
	my $q = shift;
327327
	my ($fn, $hash, @fields, @lines);
329329
	# The format string which is used for parsing the *.MD5 files.
330330
	my $format = '^[[:alnum:]]{32}\s\*.*';
332332
	# Loop through the @files array.
333333
	while (my $md5fn = $q->dequeue_nb()) {
335335
		# If the file extension is *.MD5 in either upper- or
336336
		# lowercase, continue.
337337
		if ($md5fn =~ /.md5$/i) {
339339
			# Open the *.MD5 file and read its contents to the
340340
			# @lines array.
341341
			open(MD5, '<', $md5fn) or die "Can't open '$md5fn': $!";
342342
			chomp(@lines = (<MD5>));
343343
			close(MD5) or die "Can't close '$md5fn': $!";
345345
			# Loop to check that the format of the *.MD5 file really
346346
			# is correct before proceeding.
347347
			foreach my $line (@lines) {
349349
				# If format string matches the line(s) in the *.MD5
350350
				# file, continue.
351351
				if ($line =~ /$format/) {
353353
					# Split the line so that the hash and file name go
354354
					# into @fields array.
355355
					# After that strip the path (if any) of the file
356356
					# name, and prepend the path of the *.MD5 file to
357357
					# it instead.
358358
					# Store hash and file name in the $hash and $fn
359359
					# variables for readability.
360360
					@fields = split(/\s\Q*/, $line, 2);
361361
					my $path = dirname($md5fn);
362362
					$hash = $fields[0];
364364
					if ($path eq '.') { $fn = basename($fields[1]); }
365365
					else { $fn = dirname($md5fn)
366366
								. '/' . basename($fields[1]); }
368368
					# Convert CR+LF newlines to proper LF to avoid
369369
					# identical file names from being interpreted as
370370
					# different.
371371
					$fn =~ s/\r//;
373373
					# Unless file name already is in the database hash,
374374
					# print a message, add it to the hash.
375375
					if (! $md5h{$fn} && -f $fn) {
377377
						say "$fn" . "\n\t" .
378378
						"Imported MD5 sum from '" .
379379
						basename($md5fn) .
380380
						"'.\n";
382382
						$md5h{$fn} = $hash;
384384
					# If file name is not a real file, write to
385385
					# the log.
386386
					# If file name is in database hash but the
387387
					# MD5 sum from the MD5 file doesn't match,
388388
					# print to the log.
389389
					} elsif (! -f $fn) { logger('gone', $fn); }
390390
					elsif ($md5h{$fn} ne $hash)
391391
					{ logger('diff', $md5fn); }
392392
				}
393393
			}
394394
		}
395395
	}
396396
}
398398
sub md5sum {
399399
	my $fn = shift;
400400
	my $last = $parts[1];
402402
	# If $last is set it means we're dealing with a split file, so
403403
	# set $busy to 1 for the other threads to fall asleep. We need to
404404
	# stay in this thread when we're splitting files. So we create an
405405
	# 'until' loop which will process all the parts until the last one.
406406
	if ($last) {
407407
		$busy = 1;
408408
		my $md5 = Digest::MD5->new;
409409
		my $done = 0;
410410
		until ($done == $last) {
411411
			if ($file_contents{$fn}) {
412412
				$md5->add($file_contents{$fn});
413413
				lock($file_stack);
414414
				$file_stack -= length($file_contents{$fn});
415415
				lock(%file_contents);
416416
				delete($file_contents{$fn});
417417
				$done++;
418418
			}
419419
				yield();
420420
		}
421421
		my $hash = $md5->hexdigest;
422422
		$busy = 0;
423423
		return $hash;
424424
	} else {
425425
		my $hash = md5_hex($file_contents{$fn});
426426
		lock($file_stack);
427427
		$file_stack -= length($file_contents{$fn});
428428
		lock(%file_contents);
429429
		delete($file_contents{$fn});
430430
		return $hash;
431431
	}
432432
}
434434
# Subroutine to index the files
435435
# i.e calculate and store the MD5 sums in the database hash/file.
436436
# It takes 1 argument:
437437
# (1) the thread queue
438438
sub md5index {
439439
	my $q = shift;
440440
	my $tid = threads->tid();
442442
	# Loop through the thread que.
443443
	LOOP2: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
444444
			if ($fn) {
445445
				$md5h{$fn} = md5sum($fn);
446446
				lock($n);
447447
				$n++;
448448
			} else {
449449
				yield();
450450
			}
452452
			# If the $saw_sigint variable has been tripped.
453453
			# Quit this 'while' loop, thereby closing the thread.
454454
			if ($saw_sigint == 1) {
455455
				say "Closing thread: " . $tid;
456456
				last;
457457
			}
458458
	}
460460
	while (!$q->pending() && !$stopping) {
461461
		yield();
462462
		goto(LOOP2) if (%large && !$busy);
463463
	}
464464
}
466466
# Subroutine for testing to see if the MD5 sums
467467
# in the database file are correct (i.e. have changed or not).
468468
# It takes 1 argument:
469469
# (1) the thread queue
470470
sub md5test {
471471
	my $q = shift;
472472
	my $tid = threads->tid();
473473
	my ($oldmd5, $newmd5);
475475
	# Loop through the thread queue.
476476
	LOOP: while ((my $fn = $q->dequeue_nb()) || !$stopping) {
477477
				if ($fn) {
478478
					say "$fn" . "\n" . "$tid:" . "\t" .
479479
						"Currently testing...\n";
481481
					$newmd5 = md5sum($fn);
482482
					$oldmd5 = $md5h{$fn};
484484
					# If the new MD5 sum doesn't match the one in the hash,
485485
					# and file doesn't already exist in the %err hash,
486486
					# log it and replace the old MD5 sum in the hash with
487487
					# the new one.
488488
					if ($newmd5 ne $oldmd5 && ! $err{$fn}) {
489489
						logger('diff', $fn);
490490
						$md5h{$fn} = $newmd5;
491491
					}
492492
					lock($n);
493493
					$n++;
494494
				} else {
495495
					yield();
496496
				}
497497
				# If the $saw_sigint variable has been tripped.
498498
				# Quit this 'while' loop, thereby closing the thread.
499499
				if ($saw_sigint == 1) {
500500
					say "Closing thread: " . $tid;
501501
					last;
502502
				}
503503
	}
505505
	while (!$q->pending() && !$stopping) {
506506
		yield();
507507
		goto(LOOP) if (%large && !$busy);
508508
	}
509509
}
511511
sub md5flac {
512512
	my $fn = shift;
513513
	my (@req, $hash);
515515
	if ($fn =~ /.flac$/i) {
517517
		if (! @req) {
518518
			chomp(@req = ( `which flac metaflac 2>&-` ));
520520
			if (! $req[0] || ! $req[1]) {
521521
				say "You need both 'flac' and 'metaflac' to test FLAC files!\n" .
522522
				"Using normal test method...\n";
523523
				@req = '0';
524524
				return;
525525
			}
526526
		}
528528
		unless ($req[0] = '0') {
529529
			chomp($hash = `metaflac --show-md5sum "$fn" 2>&-`);
530530
			if ($? != 0 && $? != 2) { logger('corr', $fn); return; }
532532
			system('flac', '--totally-silent', '--test', "$fn");
533533
			if ($? != 0 && $? != 2) { logger('corr', $fn); return; }
535535
			return $hash;
536536
		}
537537
	}
538538
}
540540
# Create the thread queue.
541541
my $q = Thread::Queue->new();
543543
# Depending on which script mode is active,
544544
# set the @run array to the correct arguments.
545545
# This will be used to start the threads later.
546546
my @run;
547547
given ($mode) {
548548
	when ('import') {
549549
		@run = (\&md5import, $q);
550550
	}
551551
	when ('index') {
552552
		@run = (\&md5index, $q);
553553
	}
554554
	when ('test') {
555555
		@run = (\&md5test, $q);
556556
	}
557557
}
559559
my @threads;
560560
foreach (1 .. $cores) {
561561
	push(@threads, threads->create(@run));
562562
}
564564
# This loop is where the actual action takes place
565565
# (i.e. where all the subroutines get called from)
566566
foreach my $dn (@lib) {
567567
	if (-d $dn) {
568568
		# Changing $dn to the absolute path.
569569
		$dn = abs_path($dn);
571571
		# Adding the current PATH to the $db variable.
572572
		$db = "$dn/$db";
574574
		# Change directory to $dn.
575575
		chdir($dn)
576576
			or die "Can't change directory to '$dn': $!";
578578
		# Start logging.
579579
		logger('start');
581581
		# If the database file is a real file,
582582
		# store it in the database hash.
583583
		file2hash() if (-f $db);
585585
		given ($mode) {
586586
			when ('import') {
587587
				#say("Enqueueing files");
588588
				# Get all the file names and put them in the queue.
589589
				foreach my $fn (sort(keys(%md5h))) { $q->enqueue($fn); }
590590
			}
591591
			when ('index') {
592592
				#say("Enqueueing files");
593593
				# Get all the file names and put them in the queue.
594594
				foreach my $fn (getfiles($dn)) {
595595
					while ($file_stack >= $disk_size) {
596596
						say("$file_stack > $disk_size, yielding");
597597
						yield();
598598
					}
599599
					# Unless file name exists in the database hash,
600600
					# continue.
601601
					unless ($md5h{$fn}) {
602602
						my $size = (stat($fn))[7];
603603
						if ($size < $disk_size) {
605605
							say("Reading file $fn");
606606
							open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!";
607607
							read(FILE, $file_contents{$fn}, $size);
608608
							close(FILE) or die "Can't close '$fn': $!";
610610
							say("Queueing file $fn");
611611
							$file_stack += length($file_contents{$fn});
612612
							$q->enqueue($fn);
613613
						} else {
614614
							say("Skipping $fn");
615615
							$large{$fn} = 1;
616616
						}
617617
					}
618618
				}
619619
			}
620620
			when ('test') {
621621
				#say("Enqueueing files");
622622
				# Fetch all the keys for the database hash and put
623623
				# them in the queue.
624624
				foreach my $fn (sort(keys(%md5h))) {
625625
					while ($file_stack >= $disk_size) {
626626
						say("$file_stack > $disk_size, yielding");
627627
						yield();
628628
					}
630630
					my $size = (stat($fn))[7];
631631
					if ($size < $disk_size) {
633633
						say("Reading file $fn");
634634
						open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!";
635635
						read(FILE, $file_contents{$fn}, $size);
636636
						close(FILE) or die "Can't close '$fn': $!";
638638
						$file_stack += length($file_contents{$fn});
639639
						$q->enqueue($fn);
640640
					} else {
641641
						say("Skipping $fn");
642642
						$large{$fn} = 1;
643643
					}
644644
				}
645645
			}
646646
		}
648648
		# This block is run after all the files that fit in RAM are
649649
		# done processing. This deals with the large files that
650650
		# don't fit in RAM.
651651
		if (%large) {
653653
			foreach my $fn (sort(keys(%large))) {
654654
				$parts[1] = ceil((stat($fn))[7] / $disk_size);
655655
				my $last = $parts[1];
657657
				open(FILE, '<:raw', $fn) or die "Can't open '$fn': $!";
658658
				foreach my $n (1 .. $last) {
659659
					while ($file_stack > 0) { yield(); }
660660
					say "${fn}\nPart $n of $parts[1] done ..";
661661
					if ($n == 1) {
662662
						sysread(FILE, $file_contents{$fn}, $disk_size);
664664
						lock($file_stack);
665665
						$file_stack += length($file_contents{$fn});
666666
						$parts[0] = $n;
667667
						$q->enqueue($fn);
668668
					} elsif ($n == $last) {
669669
						sysread(FILE, $file_contents{$fn}, $disk_size);
671671
						lock($file_stack);
672672
						$file_stack += length($file_contents{$fn});
673673
						$parts[0] = $n;
674674
					} else {
675675
						sysread(FILE, $file_contents{$fn}, $disk_size);
677677
						lock($file_stack);
678678
						$file_stack += length($file_contents{$fn});
679679
						$parts[0] = $n;
680680
					}
681681
				}
682682
				close(FILE) or die "Can't close '$fn': $!";
683683
				foreach (@parts) { $_ = 0; }
684684
			}
685685
		}
687687
#     use Digest::MD5;
688688
#    $md5 = Digest::MD5->new;
689689
#    $md5->add('foo', 'bar');
690690
#    $md5->add('baz');
691691
#    $digest = $md5->hexdigest;
692692
#    print "Digest is $digest\n";
695695
		$stopping = 1;
696696
		foreach my $t (threads->list()) { $t->join(); }
697697
		#say("All threads joined");
699699
		# Print the hash to the database file and close the log
700700
		hash2file();
701701
		logger('end', $n);
702702
	}
703703
}

Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.

Syntax highlighting:

To highlight particular lines, prefix each line with {%HIGHLIGHT}




All content is user-submitted.
The administrators of this site (kpaste.net) are not responsible for their content.
Abuse reports should be emailed to us at