#!/usr/bin/perl
#                              -*- Mode: Perl -*-
# dirsplit ---
# Author           : Eduard Bloch ( blade@debian.org )
# Last Modified On : Sun, 06 Feb 2005 14:59:51 +0100
# Status           : Working, but use with caution!
# License: GPLv2
#
# Splits the contents of a directory (or a list of "size path" lines) into
# volumes of a given maximum size.  The result is either a set of mkisofs
# path-list catalogs (default) or a set of target directories populated by
# moving, symlinking or hardlinking the files.

my $version="0.3.3";

require v5.8.1;
use strict;
use List::Util 'shuffle';
use Getopt::Long qw(:config no_ignore_case bundling);
use File::Basename;
use File::Path;
use Cwd 'abs_path';

# ---------------------------------------------------------------------------
# Settings and their defaults
# ---------------------------------------------------------------------------
my $ret    = 0;         # exit status, set to 1 when a file operation fails
my $max    = "4488M";   # medium size, converted to bytes by fixnr() below
my $prefix = "vol_";    # first part of the catalog/directory names
my $acc    = 20;        # number of shuffled packing attempts
my $emode  = 1;         # exploration mode (see long help)
my $bsize  = 2048;      # block size of the target filesystem
my $ofac   = 50;        # per-character overhead factor for file names

my $opt_help;
my $opt_longhelp;
my $opt_sim;
my $opt_dir;
my $opt_flat;
my $opt_move;
my $opt_ver;
my $opt_sln;
my $opt_ln;
my $opt_filter;
my $opt_simple;
my $opt_follow;
my $get_ver;
my $opt_listfile;

# State of the simple (-S) packing mode, maintained by insitem().  Declared
# here, before any code runs, so that the initial values are really assigned.
my $simplePos = 0;
my @simpleBinSizes;

my %options = (
   "h|help"       => \$opt_help,
   "d|dirhier"    => \$opt_dir,
   "flat"         => \$opt_flat,
   "f|filter=s"   => \$opt_filter,
   "F|follow"     => \$opt_follow,
   "e|expmode=i"  => \$emode,
   "o|overhead=i" => \$ofac,
   "b|blksize=i"  => \$bsize,
   "n|no-act"     => \$opt_sim,
   "m|move"       => \$opt_move,
   "l|symlink"    => \$opt_sln,
   "L|hardlink"   => \$opt_ln,
   "v|verbose"    => \$opt_ver,
   "s|size=s"     => \$max,
   "S|simple"     => \$opt_simple,
   "T|input=s"    => \$opt_listfile,
   "p|prefix=s"   => \$prefix,
   "a|accuracy=i" => \$acc,
   "H|longhelp"   => \$opt_longhelp,
   "version"      => \$get_ver
);

show_help(1) unless ( GetOptions(%options) );
show_help(1) if $opt_help;
show_longhelp() if $opt_longhelp;
if($get_ver) {
   print $version;
   exit 0;
}

# getsize() rounds up to multiples of the block size, zero would divide by 0
die "Error: block size (-b) must be a positive number\n" if($bsize <= 0);

# ignore the old dirhier setting since it is default now and disable the flag
# when opt_flat is specified
$opt_dir = !$opt_flat;

$opt_ver  = 1 if $opt_sim;
$opt_move = 1 if ($opt_sln || $opt_ln);

# big list @sizes containing the "items" (object sizes)
# %names hash mapping "items" (size as key) to arrays with filenames/subarrays
# for coalesced files
my @sizes;
my %names;

# @result contains the calculated output. In simple mode, an array (bins) of
# atoms (files or filelists). Otherwise, sizes instead of atoms, to be
# resolved with %names.
my @result;

my $inputdir;

$max = fixnr($max);
# about 400kB for iso headers
$max -= 420000;

# init default value
my $globwaste = 0;

# -d follows symlinks itself; the explicit readlink test is kept for
# compatibility with the previous behaviour.
my $source       = $ARGV[0];
my $sourceTarget = defined($source) ? readlink($source) : undef;
if(defined($source) && (-d $source || (defined($sourceTarget) && -d $sourceTarget))) {
   syswrite(STDOUT,"Building file list, please wait...\n");
   # save the absolute path before doing anything
   $inputdir = Cwd::abs_path($source);
   explore($inputdir);
}
elsif($opt_listfile) {
   if($opt_listfile eq "-") {
      parseListe(\*STDIN);
   }
   else {
      open(my $in, "<", $opt_listfile)
         or die "Cannot open list file $opt_listfile: $!\n";
      parseListe($in);
      close($in);
   }
}
else {
   die "Error: please specify a directory\n";
}

# check for pointless requests
my $testsize = 0;
for my $size (@sizes) {
   die "Too large object(s) ($size) for the given max size: @{$names{$size}} (maybe coalesced in arrays, check manually)\n" if($size > $max);
   $testsize += $size;
}

$acc = 1 if ($testsize <= $max); # just generate a list, more trials are pointless
print "\nSumm: $testsize\n" if($opt_ver);
die "Nothing to do!\n" if($testsize < 4096); # looks like just an empty dir

if(!$opt_simple) {
   syswrite(STDOUT, "Calculating, please wait...\n");
   my $starttime = time;
   # upper bound, every real packing wastes less than this
   $globwaste = $max * @sizes;
   for my $round (1..$acc) {
      syswrite(STDOUT,".");
      my @tmp;
      my $waste = bp_firstfit($max, \@sizes, \@tmp);
      if($waste < $globwaste) {
         $globwaste = $waste;
         @result = @tmp;
      }
      if($starttime && time > $starttime+10) {
         syswrite(STDOUT,"\nSpent already over 10s (for $round iterations)\nHint: reduce accuracy to make it faster!\n");
         undef $starttime; # print the hint only once
      }
      # first-fit depends on the input order, try another one next time
      @sizes = shuffle(@sizes);
   }
}

print "\nCalculated, using ".(scalar @result)." volumes.\n";
print "Wasted: $globwaste Byte (estimated, check mkisofs -print-size ...)\n";

# and the real work
my $volume = 0;
for my $bin (@result) {
   $volume++;

   # the catalog file is only needed when neither moving/linking nor simulating
   my $catalog;
   if(!($opt_move || $opt_sim)) {
      my $listname = "$prefix$volume.list";
      open($catalog, ">", $listname)
         or die "Cannot write catalog $listname: $!\n";
   }

   my $dirPrefix    = dirname($prefix);
   my $prefixBase   = basename($prefix);
   my $dirPrefixAbs = Cwd::abs_path($dirPrefix);

   for my $entry (@{$bin}) {
      # For simple mode, the files/atoms are already resolved, otherwise take
      # the next with appropriate size.
      my $item = $opt_simple ? $entry : shift(@{$names{$entry}});

      # make reference point to an array with our files, create a list if needed
      my $stuffRef = (ref($item) eq "ARRAY") ? $item : [$item];

      for my $file (@$stuffRef) {
         my $relFile = relative_name($file, $inputdir);
         my $base    = basename($relFile);

         if($opt_move) {
            my $targetsubdir = $dirPrefixAbs."/$prefixBase$volume";
            $targetsubdir .= "/".dirname($relFile) if($opt_dir);
            my $target = "$targetsubdir/$base";
            print "$file -> $target\n" if($opt_ver);
            next if($opt_sim);

            # mkpath croaks on errors and returns nothing when the directory
            # already exists, so its return value must not be tested
            eval { mkpath($targetsubdir); 1 }
               or die "Problems creating $targetsubdir\n";
            # last check
            die "Could not create $targetsubdir?\n" if(!(-d $targetsubdir && -w $targetsubdir));

            my $ok;
            if($opt_sln) {
               $ok = symlink($file, $target);
            }
            elsif($opt_ln) {
               if(-d $file && !-l $file) {
                  # directories cannot be hardlinked, recreate them instead
                  $ok = mkdir($target) || -d $target;
               }
               else {
                  $ok = link($file, $target);
               }
            }
            else {
               $ok = rename($file, $target);
            }
            if(!$ok) {
               warn "Could not process $file -> $target: $!\n";
               $ret = 1;
            }
         }
         else {
            # escape = in mkisofs catalogs, they are used as separator
            my $isoname = ($opt_dir ? $relFile : $base);
            $isoname =~ s/=/\\=/g;
            my $sourcefile = $file;
            $sourcefile =~ s/=/\\=/g;
            print "$volume: /$isoname=$sourcefile\n" if $opt_ver;
            print $catalog "/$isoname=$sourcefile\n" if(!$opt_sim);
         }
      }
   }

   if($catalog) {
      # buffered write errors show up at close time
      if(!close($catalog)) {
         warn "Could not write $prefix$volume.list: $!\n";
         $ret = 1;
      }
   }
}

exit $ret;

# Returns the name of a file relative to the scanned input directory.
# args: file name, input directory (undefined in list file mode)
# Paths that are not located below the input directory (list file mode) are
# returned without their leading slashes.
sub relative_name {
   my ($file, $base) = @_;
   if(defined($base)) {
      return "" if($file eq $base); # the (empty) input directory itself
      return substr($file, length($base)+1) if(index($file, "$base/") == 0);
   }
   (my $rel = $file) =~ s{^/+}{};
   return $rel;
}

# recursive function
# parameter: directory
# mode 1: descend as far as possible and index all non-directories
# mode 2++:
# put all files of a dir into coalesced-object, then descend into each dir
sub explore {
   my ($dir) = @_;
   my @dirs;
   my @files;

   opendir(my $dh, $dir) or die "Could not open $dir\n";
   my @stuff = readdir($dh);
   closedir($dh);

   if($opt_simple) {
      @stuff = sort { lc($a) cmp lc($b) } @stuff;
   }

   foreach my $f (@stuff) {
      next if ($f eq "." || $f eq "..");
      $f = "$dir/$f" if($dir ne ".");

      if ($opt_filter) {
         # NOTE(review): the filter is a Perl match expression given on the
         # command line and is evaluated as code - never feed it with
         # untrusted input.
         next unless (eval("\$f=~$opt_filter;"));
      }

      if(-l $f && ! $opt_follow) {
         # symlinks are treated like plain files unless -F is given
         push(@files, $f);
      }
      elsif(-d $f) {
         push(@dirs, $f);
      }
      else {
         push(@files, $f);
      }
   }

   if( (@dirs + @files) == 0 ) {
      # this one is empty, register for cosmetics reason
      insitem(getsize($dir), $dir);
      return;
   }

   # recurse on directories
   explore($_) for(@dirs);

   # and now process files
   if($emode == 1) {
      insitem(getsize($_), $_) for(@files);
      return;
   }

   # handle coalesced objects - first some sanity checks and splitting if
   # required
   my %sizeOf;
   my $filesum = 0;
   for my $file (@files) {
      my $size = getsize($file);
      if($size > $max) {
         # already too large, stop right here
         die "Too large file ($file) for the given max size $max, aborting...\n";
      }
      $sizeOf{$file} = $size;
      $filesum += $size;
   }

   # handle coal. objects becoming too large
   if($filesum > $max) {
      if($emode == 3) {
         # don't coalesce in this mode, do like mode 1 above, leave them alone
         insitem($sizeOf{$_}, $_) for(@files);
         return;
      }
      if($emode == 4) {
         # split the file set into several coal. objects, none exceeding $max
         my $partsum = 0;
         my @chunk;
         for my $file (sort(@files)) {
            my $size = $sizeOf{$file};
            if(@chunk && $partsum + $size > $max) {
               # the file does not fit anymore; register a copy of the chunk
               # so that later changes do not affect the registered object
               insitem($partsum, [@chunk]);
               @chunk   = ();
               $partsum = 0;
            }
            push(@chunk, $file);
            $partsum += $size;
         }
         # do not forget the remaining files
         insitem($partsum, [@chunk]) if(@chunk);
         return;
      }
   }

   # ok, building a coalesced object for simple cases
   if($filesum) {
      insitem($filesum, \@files);
   }
}

# Registers an object for the calculation.
# args: size, object (filename or list reference)
sub insitem {
   my ($size, $object) = @_;

   # normally, put the items into the pool for calculation.
   # In simple mode, calculate here
   push(@sizes, $size);
   push(@{$names{$size}}, $object);

   if($opt_simple) {
      # now the simplest method to fill the bins, just take a new one when the
      # object-to-be-added no longer fits
      my $used = $simpleBinSizes[$simplePos] || 0;
      if($used + $size > $max) {
         $globwaste += ( $max - $used );
         $simplePos++;
      }
      $simpleBinSizes[$simplePos] += $size;
      push( @{$result[$simplePos]}, $object);
   }
}

# Estimates the space an object needs on the medium: the file size rounded up
# to full blocks plus an overhead depending on the length of the file name.
# args: file name
sub getsize {
   my ($file) = @_;
   my $size = (stat($file))[7];
   $size = 0 unless defined($size); # stat failed, e.g. dangling symlink
   my $rest = ($size % $bsize);
   $size = ($size + $bsize - $rest) if ($rest);
   return 1+int(200 + $ofac*length(basename($file)) + $size);
}

# Reads "SIZE PATH" lines from a file handle and registers them as items.
# Sizes may carry a unit suffix (see fixnr); other lines are ignored.
# args: file handle (glob or glob reference)
sub parseListe {
   my ($fh) = @_;
   while(my $line = <$fh>) {
      if($line =~ /^(\w+)\s+(.+)/) {
         insitem(fixnr($1), $2);
      }
   }
}

# Converts a number with a unit suffix into bytes.
# args:
# Number
# optional: default multiplier
# Returns the first argument unchanged if there is neither a suffix nor a
# default multiplier; dies on unknown multipliers.
sub fixnr {
   my ($input, $default) = @_;

   # kept inside of the sub: file level initialisations located behind the
   # "exit" of the main program would never be executed
   my %factorOf = (
      g => 1000000000,
      G => 1073741824,
      m => 1000000,
      M => 1048576,
      k => 1000,
      K => 1024,
      b => 1,
   );

   my $nr;
   my $fac;
   if($input =~ /(\d+)(\D)/) {
      $nr  = $1;
      $fac = $2;
   }
   elsif(defined($default)) {
      $nr  = $input;
      $fac = $default;
   }
   else {
      return $input;
   }

   die "$fac is not a valid multiplier!" unless exists($factorOf{$fac});
   return $nr * $factorOf{$fac};
}

# Prints the short option summary and exits.
# args: exit code
sub show_help {
   my ($code) = @_;
   print <<"EOM";
dirsplit [options] [advanced options] < directory >
 -H|--longhelp Show the long help message with more advanced options
 -n|--no-act   Only print the commands, no action (implies -v)
 -s|--size     NUMBER - Size of the medium (default: $max)
 -e|--expmode  NUMBER - directory exploration mode (recommended, see long help)
 -m|--move     Move files to target dirs (default: create mkisofs catalogs)
 -p|--prefix   STRING - first part of catalog/directory name (default: vol_)
 -h|--help     Show this option summary
 -v|--verbose  More verbosity

The complete help can be displayed with the --longhelp (-H) option.
The default mode is creating file catalogs useable with:
    mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Example:
   dirsplit -m -s 700M -e2 random_data_to_backup/
EOM
   exit($code || 0);
}

# Prints the complete help message and exits with status 0.
sub show_longhelp {
   my $msglong = <<"EOM";

dirsplit [options] [advanced options] < directory >
 -n|--no-act   Only print the commands, no action (implies -v)
 -s|--size     NUMBER - Size of the medium (default: $max)
 -m|--move     Move files to target dirs (default: create mkisofs catalogs)
 -l|--symlink  similar to -m but just creates symlinks in the target dirs
 -L|--hardlink like -l but creates hardlinks
 -p|--prefix   STRING - first part of catalog/directory name (default: vol_)
 -f|--filter   EXPR - Filter expression, see examples below and perlre manpage
 --flat        Flat dir mode, don't recreate subdirectory structure (not recommended)
 -e|--expmode  NUMBER, special exploration modes, used with directory argument

  1: (default) native exploration of the specified directory, but file sizes
     are rounded up to 2048 blocks plus estimated overhead for
     filenames (see -o option)
  2: like 1, but all files in directory are put together (as "atom") onto the
     same medium. This does not apply to subdirectories, however.
  3: like 2, but don't coalesc files when the size of the "atom" becomes too
     large for the medium size (currently $max)
  4: like 2, but the max. size of the atoms is limited to $max (storing the
     rest on another medium)

 -F|--follow   Follow symlinks. Use with care!
 -b|--blksize  NUMBER, block size of the target filesystem (currently $bsize).
 -o|--overhead NUMBER, overhead caused by directory entries (as factor for the
               filename length, default: 50, empiricaly found for Joliet+RR
               with not-so-deep directory structure). Works in exploration
               mode.
 -a|--accuracy NUMBER (1=faster, large number=better efficiency, default: 20)
 -S|--simple   Simple/stupid/alphabetic mode
 -T|--input    FILENAME (or - for STDIN):  List with sizes and paths, try:
               find dir -type f -printf "%s %p\\n"
               to get an example. Avoid duplicates! Unit suffixes are allowed.
 -h|--help     Show this option summary
 -v|--verbose  More verbosity

File sizes are expected to be in bytes, append modifier letters to multiply
with a factor, eg 200M (b,k,K,m,M,g,G for Bytes, Kb, KiB, Mb, MiB, Gb, GiB).
The default output mode is creating file catalogs useable with
    mkisofs -D -r --joliet-long -graft-points -path-list CATALOG

Examples:
   dirsplit -m -s 120M -e4 largedirwithdata/ -p /zipmedia/backup_
   #move stuff into splitted backup dirs

   dirsplit -s 700M -e2 music/
   # make mkisofs catalogs to burn all music to 700M CDRs, keep single files in
   # each dir together

   dirsplit -s 700M -e2 -f '/other\\/Soundtracks/' music/
   # like above, only take files from other/Soundtracks

   dirsplit -s 700M -e2 -f '!/Thumbs.db|Desktop.ini|\\.m3u\$/i'
   # like above, ignore some junk files and playlists, both letter cases

Bugs: overhead trough blocksize alignment and directory entry storage varies,
heavily depends on the target filesystem and configuration (see -b and -o).

You should compare the required size of the created catalogs, eg.:
for x in *list ; do mkisofs -quiet -D -r --joliet-long -graft-points \\
 -path-list \$x -print-size; done
(output in blocks of 2048 bytes) with the expected size (-s) and media data
(cdrecord -v -toc ...).
EOM
   print $msglong;
   exit 0;
}

# Best-fit bin packing: every object goes into the bin where it leaves the
# least free space, a new bin is opened when it fits nowhere.
# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int)
sub bp_bestfit {
   my ($max, $in, $target) = @_;
   my @out; # the bins, arrays of object sizes
   my @bel; # used space per bin

   for my $obj (@{$in}) {
      my $bestplace = scalar(@out); # index of a new bin, used if nothing fits
      my $bestwert  = $max;
      for my $bin (0..$#out) {
         my $rest = $max - $bel[$bin] - $obj;
         # an exact fit (rest == 0) is the best possible place
         if($rest >= 0 && $rest < $bestwert) {
            $bestplace = $bin;
            $bestwert  = $rest;
         }
      }
      if($bestplace > $#out) {
         push(@bel, $obj);
         push(@out, [$obj]);
      }
      else {
         $bel[$bestplace] += $obj;
         push( @{$out[$bestplace]}, $obj);
      }
   }

   my $ret = 0;
   # count all rests but the last one
   for my $bin (0..$#out-1) {
      $ret += ($max - $bel[$bin]);
   }
   @{$target} = @out;
   return $ret;
}

# First-fit bin packing: every object goes into the first bin with enough
# free space, a new bin is opened when it fits nowhere.
# Parms: bin size (int), input array (arr reference), output array (arr reference)
# Returns: wasted space (int)
sub bp_firstfit {
   my ($max, $in, $target) = @_;
   my @out; # the bins, arrays of object sizes
   my @bel; # used space per bin

   PIECE: foreach my $obj (@{$in}) {
      # first fit, use the first bin with enough free space
      for my $bin (0..$#out) {
         my $newsize = $bel[$bin] + $obj;
         if( $newsize <= $max ) {
            # fits here
            $bel[$bin] = $newsize;
            push( @{$out[$bin]}, $obj);
            next PIECE;
         }
      }
      # new bin
      push(@bel, $obj);
      push(@out, [$obj]);
   }

   my $ret = 0;
   # sum up all rests except of the one from the last bin
   for my $bin (0..$#out-1) {
      $ret += ($max - $bel[$bin]);
   }
   @{$target} = @out;
   return $ret;
}