diff --git a/tools/zanitizer b/tools/zanitizer
new file mode 100755
index 0000000000..b1e6d47855
--- /dev/null
+++ b/tools/zanitizer
@@ -0,0 +1,211 @@
+#!/usr/bin/perl
+
+# Filter a `git fast-export` stream: pass blob contents and commit messages
+# through scrub_text, drop file changes rejected by keep_file (both come from
+# zanitizer_config), skip commits that end up changing nothing, and print the
+# resulting stream for `git fast-import` on stdout.
+#
+# usage:
+#   git clone --bare git@git.zulip.net:eng/zulip.git
+#   cd zulip.git
+#   git fast-export --export-marks=/tmp/em --progress=1000 -- --all > /tmp/fe
+#   git init --bare ../zulip-zanitized.git
+#   cd ../zulip-zanitized.git
+#   zanitizer /tmp/fe /tmp/em | git fast-import --quiet
+
+use strict;
+use warnings;
+
+use Digest::SHA qw(sha1_hex);
+use FindBin;
+
+use lib $FindBin::Bin;
+use zanitizer_config;
+
+# Return true when two trees (hash refs mapping path to "mode mark") contain
+# exactly the same paths with exactly the same values.
+sub eq_tree {
+    my ($left, $right) = @_;
+    return !(grep { !exists $$right{$_} || $$left{$_} ne $$right{$_} } keys %$left)
+        && !(grep { !exists $$left{$_} } keys %$right);
+}
+
+# Read the next line from $fh; die if the stream ends while parsing $context.
+sub next_line {
+    my ($fh, $context) = @_;
+    my $line = <$fh>;
+    defined $line or die "unexpected end of stream in $context";
+    return $line;
+}
+
+# Read the next line from $fh and return the first capture group of $pattern;
+# die if the line does not match.  $what names the expected line in errors.
+sub read_field {
+    my ($fh, $pattern, $what) = @_;
+    my $line = next_line($fh, $what);
+    my ($value) = $line =~ $pattern or die "expected $what, got: $line";
+    return $value;
+}
+
+my ($fast_export_file, $export_marks_file) = @ARGV;
+defined $fast_export_file
+    or die "usage: $0 FAST_EXPORT_FILE [EXPORT_MARKS_FILE]\n";
+
+# Mark => second field of the matching line in the export-marks file.
+my %export_marks = ();
+if (defined $export_marks_file) {
+    open my $marks_fh, '<', $export_marks_file or die "cannot open $export_marks_file: $!";
+    %export_marks = map {split} <$marks_fh>;
+    close $marks_fh;
+}
+
+my %mark_map = ();      # input mark => mark used in the output (undef if dropped)
+my %blob_mark = ();     # SHA-1 of scrubbed blob data => mark of the first such blob
+my %ref_commit = ();    # ref => output mark of its current tip
+my %commit_tree = ();   # emitted commit mark => { path => "mode mark" }
+my %scrubbed_blob = (); # marks of blobs that scrub_text changed
+my %scrubbed_file = (); # kept paths that referenced a scrubbed blob
+my %deleted_file = ();  # modified paths that keep_file rejected
+
+# The stream carries byte counts, so read and write raw bytes.
+open my $fe, '<:raw', $fast_export_file or die "cannot open $fast_export_file: $!";
+binmode STDOUT;
+
+$_ = <$fe>;
+while (defined $_) {
+    if ($_ eq "blob\n") {
+        my $mark = read_field($fe, qr/^mark (\S*)\n$/s, 'blob mark');
+        my $len = read_field($fe, qr/^data (\d+)\n$/s, 'blob data header');
+        read($fe, my $data, $len) == $len or die "short read in blob $mark";
+        $_ = $data;
+        scrub_text();
+        if ($_ ne $data) {
+            $scrubbed_blob{$mark} = 1;
+            $data = $_;
+        }
+        next_line($fe, "blob $mark") eq "\n" or die "missing LF after blob $mark";
+
+        # Blobs that are identical after scrubbing share one output mark.
+        my $hash = sha1_hex($data);
+        if (exists $blob_mark{$hash}) {
+            $mark_map{$mark} = $blob_mark{$hash};
+        } else {
+            $blob_mark{$hash} = $mark_map{$mark} = $mark;
+            print "blob\nmark $mark\ndata ", length($data), "\n", $data, "\n";
+        }
+    } elsif (/^reset (?'ref'.*)\n$/s) {
+        my $ref = $+{ref};
+        $_ = <$fe>;
+        my $from = undef;
+        # Stop at end of stream too: a reset may be the last command.
+        while (defined $_) {
+            if ($_ eq "\n") {
+                $_ = <$fe>;
+                last;
+            } elsif (/^from (?'from'.*)\n$/s) {
+                $from = $+{from};
+            } else {
+                # The trailing LF on reset is optional
+                last;
+            }
+            $_ = <$fe>;
+        }
+
+        # Look the target up by the parsed $from; the literal key 'from' is
+        # never stored in %mark_map.
+        my $target = defined $from ? $mark_map{$from} : undef;
+        $ref_commit{$ref} = $target;
+        print "reset $ref\n";
+        print "from $target\n" if defined $target;
+        print "\n";
+
+        # $_ already holds the next command, so skip the read at loop end.
+        next;
+    } elsif (/^commit (?'ref'.*)\n$/s) {
+        my $ref = $+{ref};
+        my $mark = read_field($fe, qr/^mark (\S*)\n$/s, 'commit mark');
+        my $author = read_field($fe, qr/^author (.*)\n$/s, 'author');
+        my $committer = read_field($fe, qr/^committer (.*)\n$/s, 'committer');
+        my $len = read_field($fe, qr/^data (\d+)\n$/s, 'commit data header');
+        read($fe, my $data, $len) == $len or die "short read in message of commit $mark";
+        $_ = next_line($fe, "commit $mark");
+        my $from = undef;
+        if (/^from (?'from'.*)\n$/s) {
+            $from = $+{from};
+            $_ = next_line($fe, "commit $mark");
+        }
+        # Without an explicit "from", fast-import continues from the current
+        # tip of the ref, so use that as the base (not the literal key 'ref').
+        my $base = defined $from ? $mark_map{$from} : $ref_commit{$ref};
+        my @merge = ();
+        while (/^merge (?'mark'\S*)\n$/s) {
+            die "unimplemented case" if !defined $from;
+            push @merge, $+{mark};
+            $_ = next_line($fe, "commit $mark");
+        }
+        # git fast-export incorrectly writes M before D when replacing
+        # a symlink with a directory. We move every D before every M
+        # to work around this bug.
+        my @delete = ();
+        my @modify = ();
+        while ($_ ne "\n") {
+            if (/^D (?'file'.*)\n$/s) {
+                my %change = %+;
+                $_ = $change{file};
+                push @delete, \%change if keep_file();
+            } elsif (/^M (?'mode'\d+) (?'mark'\S+) (?'file'.*)\n$/s) {
+                my %change = %+;
+                $_ = $change{file};
+                if (keep_file()) {
+                    $scrubbed_file{$change{file}} = 1 if exists $scrubbed_blob{$change{mark}};
+                    push @modify, \%change;
+                } else {
+                    $deleted_file{$change{file}} = 1;
+                }
+            } else {
+                die "unhandled command in commit: $_";
+            }
+            $_ = next_line($fe, "commit $mark");
+        }
+        my $base_tree = defined $base ? $commit_tree{$base} : {};
+        my %tree = %$base_tree;
+        delete $tree{$$_{file}} for @delete;
+        $tree{$$_{file}} = "$$_{mode} $mark_map{$$_{mark}}" for @modify;
+
+        if (eq_tree(\%tree, $base_tree) && !(grep {defined $mark_map{$_}} @merge)) {
+            # Tree unchanged and no surviving merge parent: alias to the base.
+            $ref_commit{$ref} = $mark_map{$mark} = $base;
+        } else {
+            $ref_commit{$ref} = $mark_map{$mark} = $mark;
+            $commit_tree{$mark} = \%tree;
+            $_ = $data;
+            scrub_text();
+            if (exists $export_marks{$mark}) {
+                $_ .= "\n" until /\n\n$/;
+                $_ .= "(imported from commit $export_marks{$mark})\n";
+            }
+            print "commit $ref\nmark $mark\nauthor $author\ncommitter $committer\ndata ", length($_), "\n", $_;
+            if (defined $from) {
+                die "unimplemented case" if !defined $mark_map{$from};
+                print "from $mark_map{$from}\n";
+            }
+            for (@merge) {
+                print "merge $mark_map{$_}\n" if defined $mark_map{$_};
+            }
+            print "D $$_{file}\n" for @delete;
+            print "M $$_{mode} $mark_map{$$_{mark}} $$_{file}\n" for @modify;
+            print "\n";
+        }
+    } elsif (/^progress /) {
+        print $_;
+    } else {
+        die "unhandled command: $_";
+    }
+    $_ = <$fe>;
+}
+close $fe;
+
+print STDERR "Deleted files:\n";
+print STDERR " $_\n" for sort keys %deleted_file;
+print STDERR "Scrubbed files:\n";
+print STDERR " $_\n" for sort keys %scrubbed_file;
diff --git a/tools/zanitizer_config.pm.sample b/tools/zanitizer_config.pm.sample
new file mode 100644
index 0000000000..a5b0f70de5
--- /dev/null
+++ b/tools/zanitizer_config.pm.sample
@@ -0,0 +1,19 @@
+use strict;
+use warnings;
+
+# Scrub the text held in $_ in place.  The caller puts blob contents and
+# commit messages in $_.  Data starting with "\x89PNG" or "PK\x03\x04" is
+# left untouched.
+sub scrub_text {
+    return if /^\x89PNG/ || /^PK\x03\x04/;
+
+    s/opensesame/xxxxxxxxxx/g;
+    s/hunter2/xxxxxxx/g;
+}
+
+# Return true if the path held in $_ should be kept.
+sub keep_file {
+    !m%^secret-directory/% && !m%settings\.ini$%
+}
+
+1;