shift/BUGS

KNOWN BUGS
==========

1. A bug exists in the Shift 4.0 and 5.0 tar creation function that
   could leave some entries in tar files in a partially corrupted state.
   The conditions under which this could occur are very specific, so the
   overall percentage of affected tar files is expected to be very low.

   To be affected, a directory name within the tar has to be either
   (1) a multiple of 512 characters in length or (2) between 155-255
   characters in length with a slash at the 100th character from the
   end.  In the first case, the 512-byte header of the entry immediately
   following the directory entry would be corrupted.  In the second
   case, the 512-byte header as well as an additional 512 bytes (which
   may be an additional header, a different header, or the initial 512
   bytes of a file's data) would be corrupted.  All other entries and
   data in affected tar files will be intact and can be extracted
   normally.

   Because Shift validates tar entries at the end of tar creation,
   most transfers in which these conditions were met would have
   initially ended in an "error" state with one of the following
   messages:

    - Invalid tar header checksum
    - Invalid tar long link/name data

   Note, however, that any transfers that were restarted (via --restart)
   after the issue occurred are likely to have completed without further
   error, even though the corrupted file(s) would not have been
   repaired by the restart operation.

   To help determine whether a given tar file has been impacted by this
   bug, the perl code following this description can be saved to a file
   (e.g. tarcheck.pl) and run on any tar file.  Note that the results
   of this tool are only meaningful for tar files created with Shift
   versions 4.0 or 5.0.  Run the tool as follows:

    tarcheck.pl data1.tar data2.tar ... dataN.tar

   The tool will first attempt to check files using an associated table
   of contents (.toc) file.  If no such file is found (which normally
   occurs only when Shift is invoked without --index-tar), it will then
   analyze the actual tar entries to search for corrupted entries.

   Files with affected entries found in .toc files will report a message
   similar to the following:

    ONE OF HEADER OF

      /some/file/1

    OR HEADER AND FIRST 512 BYTES OF

      /some/file/2

    IS AFFECTED

   Files with affected entries that do not have a corresponding .toc
   file will report one of the following messages:

    - Invalid tar record at byte N
    - Invalid tar header checksum

   If neither type of message is displayed, the tar file is not
   affected.


##############################
#### BEGIN TAR CHECK CODE ####
##############################

#!/usr/bin/perl
# this program checks one or more tar files given on the command line
# for the Shift tar corruption problem in versions 4.0 and 5.0

use strict;

my $force = shift @ARGV;
if ($force ne '-f' && $force ne '-t') {
    unshift(@ARGV, $force);
    $force = undef;
}

foreach my $tar (@ARGV) {
    print "$tar:\n";
    if (! -e $tar) {
        print "    ...does not exist\n";
    } elsif (-e "$tar.toc" && $force ne '-f') {
        print "    ...reading toc file $tar.toc\n";
        check_toc("$tar.toc");
    } elsif ($force ne '-t') {
        print "    ...reading contents of $tar\n";
        find_tar($tar);
    }
}

sub check_toc {
    my $toc = shift;
    if (open(TOC, '<', $toc)) {
        my $diff;
        while (<TOC>) {
            chomp;
            my @cols = split(/\s+/);
            my $name = join(" ", @cols[7 .. scalar(@cols) - 1]);
            if ($diff) {
                print " OF\n\n\t\t$name\n\n\tIS AFFECTED\n";
                $diff = 0;
            }
            next if ($cols[0] !~ /^d/);

            my %size;
            foreach my $tar_name ($name, $name . "/") {
                if (length($tar_name) > 100) {
                    my $pos = index($tar_name, "/", length($tar_name) - 100);
                    if ($pos == -1 || $pos > 155 || length($tar_name) > 255) {
                        # add size of long name plus extra record
                        my $asize = 512 + length($tar_name) + 512;
                        $asize += (512 - ($asize % 512)) if ($asize % 512 > 0);
                        $size{$tar_name} = $asize;
                        next;
                    }
                }
                $size{$tar_name} = 512;
            }
            $diff = $size{$name . "/"} - $size{$name};
            if ($diff) {
                print "\n\tONE OF HEADER OF\n\n\t\t$name\n\n\tOR HEADER";
                print " AND FIRST ", $diff - 512, " BYTES" if ($diff > 512);
            }
        }
        close TOC;
    } else {
        print "    ERROR: unable to open toc file $toc\n";
    }
}

# output list of files/dirs within given files with stat info
# based on Tar/Archive::Tar 0.07 by Calle Dybedahl (no license specified)
sub find_tar {
    my $spath = shift;

    my $fh;
    $fh = undef if (!open($fh, '<', $spath));

    my $tell = 0;
    if (!$fh) {
        print "Unable to open tar file $spath\n";
        return;
    }
    binmode $fh;

    my %real;
    my ($eof, $head);
    read($fh, $head, 512);
    while (length($head) == 512) {
        # end of archive is two blocks of 512 but GNU tar uses one sometimes
        if ($head eq "\0" x 512) {
            $eof = 1;
            last;
        }

        # uid, gid, and size must be 'a' instead of 'A' for base-256 encoding
        # name, lnk, mgc, unam, gnam, and pfx are 'Z' for trailing whitespace
        my @attrs = unpack('Z100A8a8a8a12A12A8A1Z100Z6A2Z32Z32A8A8Z155', $head);
        # name mode uid gid size time sum type lnk mgc ver unam gnam dmj dmn pfx
        #  0    1    2   3   4    5    6   7    8   9   10  11   12   13  14  15

        # prepend prefix to name
        if ($attrs[15]) {
            $attrs[0] = $attrs[15] . "/" . $attrs[0];
            $attrs[15] = "";
        }
        # remove last non-standalone slash
        $attrs[0] =~ s/(?!^)\/$//;

        if (!$attrs[0]) {
            print "Empty file name in tar file $spath\n";
            # read next header
            read($fh, $head, 512);
            next;
        }
        # old GNU tar may have space after ustar
        if ($attrs[9] ne 'ustar' && $attrs[9] ne 'ustar ') {
            if ($tell == 0) {
                print "Not in supported ustar format\n";
                return;
            }
            print "Invalid tar record at byte ", tell($fh) - 512, "\n";
            # read next header
            read($fh, $head, 512);
            next;
        }

        # convert octal numeric fields
        $attrs[$_] = oct($attrs[$_]) foreach (1, 5, 6, 13, 14);

        # handle GNU large uid/gid/size extension (two's-complement base-256)
        foreach my $i (2 .. 4) {
            if (substr($attrs[$i], 0, 1) eq "\x80") {
                my $val = ord(substr($attrs[$i], 1, 1)) & 0xff;
                for (2 .. ($i == 4 ? 11 : 7)) {
                    $val <<= 8;
                    $val |= (ord(substr($attrs[$i], $_, 1)) & 0xff);
                }
                $attrs[$i] = $val;
            } else {
                $attrs[$i] = oct $attrs[$i];
            }
        }

        # validate checksum
        substr($head, 148, 8) = "        ";
        if (unpack("%16C*", $head) != $attrs[6]) {
            print "Invalid tar header checksum for $attrs[0]\n";
            # read next header
            read($fh, $head, 512);
            next;
        }

        # handle GNU long names
        if ($attrs[7] =~ /^[LK]$/) {
            do {
                # read next header
                read($fh, $head, 512);
                $head = substr($head, 0, $attrs[4]) if ($attrs[4] < 512);
                # remove the extra byte used for \0
                $head =~ s/\0$//;
                $real{$attrs[7]} .= $head;
                $attrs[4] -= 512;
            } while ($attrs[4] > 0);
            # read next header
            read($fh, $head, 512);
            next;
        }

        # find next header
        my $offset = tell($fh);
        if (!seek($fh, $attrs[4], 1)) {
            print "Unable to seek in tar file $spath\n";
            last;
        }
        my $diff = $attrs[4] % 512;
        # ignore padding
        if ($diff != 0 && !seek($fh, 512 - $diff, 1)) {
            print "Unable to ignore padding in tar file $spath\n";
            last;
        }
        $tell = $offset + $attrs[4] + ($diff ? 512 - $diff : 0);

        if ($real{L}) {
            $attrs[0] = $real{L};
            $real{L} = undef;
        }
        if ($real{K}) {
            $attrs[8] = $real{K};
            $real{K} = undef;
        }

        # read next header
        read($fh, $head, 512);

        if ($attrs[0] eq '././@LongLink') {
            print "Dangling long link/name record\n";
            next;
        }
        my $udst = tar_canonpath($attrs[0]);
        substr($udst, 0, 0) = "/" if ($udst !~ /^\//);
    }
    if (length($head) < 512) {
        print "Unable to read header at offset $tell in tar file $spath\n";
    }
    close $fh;
}

# return given path logically cleaned of . and .. and stripped of leading ..
sub tar_canonpath {
    my $path = shift;
    my $abs = $path =~ /^\//;
    my @dirs = File::Spec->splitdir($path);
    for (my $i = 0; $i < scalar(@dirs); $i++) {
        if ($dirs[$i] eq '.' || $dirs[$i] eq '') {
            # ./foo becomes foo, foo//bar becomes foo/bar
            splice(@dirs, $i--, 1);
        } elsif ($dirs[$i] ne '..' && $dirs[$i + 1] eq '..') {
            # foo/../bar becomes bar
            splice(@dirs, $i, 2);
            $i -= 2;
        }
    }
    # remove leading ..
    shift @dirs while ($dirs[0] eq '..');
    # make path absolute if it was originally
    unshift(@dirs, "/") if ($abs);
    return File::Spec->catdir(@dirs);
}

# return uri-unescaped version of given string
sub unescape {
    my $text = shift;
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if (defined $text);
    return $text;
}

############################
#### END TAR CHECK CODE ####
############################