#!/usr/bin/perl # Copyright (c) 2010 Jonathan Kamens. # Released under the GNU General Public License, Version 3. # See . # $Id: delete-s3-bucket.pl,v 1.5 2010/10/26 14:11:09 jik Exp $ # Deleting an Amazon S3 bucket is hard. # # * You can't delete the bucket unless it is empty. # # * There is no API for telling Amazon to empty the bucket, so you have to # delete all of the objects one by one yourself. # # * If you've recently added a lot of large objects to the bucket, then they # may not all be visible yet on all S3 servers. This means that even after the # server you're talking to thinks all the objects are all deleted and lets you # delete the bucket, additional objects can continue to propagate around the S3 # server network. If you then recreate the bucket with the same name, those # additional objects will magically appear in it! # # It is not clear to me whether the bucket delete will eventually propagate to # all of the S3 servers and cause all the objects in the bucket to go away, but # I suspect it won't. I also suspect that you may end up continuing to be # charged for these phantom objects even though the bucket they're in is no # longer even visible in your S3 account. # # * If there's a CR, LF, or CRLF in an object name, then it's sent just that # way in the XML that gets sent from the S3 server to the client when the # client asks for a list of objects in the bucket. Unfortunately, the XML # parser on the client will probably convert it to the local line ending # character, and if it's different from the character that's actually in the # object name, you then won't be able to delete it. Ugh! This is a bug in the # S3 protocol; it should be enclosing the object names in CDATA tags or # something to protect them from being munged by the XML parser. # # Note that this bug even affects the AWS Web Console provided by Amazon! # # * If you've got a whole lot of objects and you serialize the delete process, # it'll take a long, long time to delete them all. use threads; use strict; use warnings; # Keys can have newlines in them, which screws up the communication # between the parent and child processes, so use URL encoding to deal # with that. use CGI qw(escape unescape); # Easiest place to get this functionality. use File::Basename; use Getopt::Long; use Net::Amazon::S3; my $whoami = basename $0; my $usage = "Usage: $whoami [--help] --access-key-id=id --secret-access-key=key --bucket=name [--processes=#] [--wait=#] [--nodelete] Specify --processes to indicate how many deletes to perform in parallel. You're limited by RAM (to hold the parallel threads) and bandwidth for the S3 delete requests. Specify --wait to indicate seconds to require the bucket to be verified empty. This is necessary if you create a huge number of objects and then try to delete the bucket before they've all propagated to all the S3 servers (I've seen a huge backlog of newly created objects take *hours* to propagate everywhere). See the comment at the top of the script for more information about this issue. Specify --nodelete to empty the bucket without actually deleting it.\n"; my($aws_access_key_id, $aws_secret_access_key, $bucket_name, $wait); my $procs = 1; my $delete = 1; die if (! GetOptions( "help" => sub { print $usage; exit; }, "access-key-id=s" => \$aws_access_key_id, "secret-access-key=s" => \$aws_secret_access_key, "bucket=s" => \$bucket_name, "processes=i" => \$procs, "wait=i" => \$wait, "delete!" => \$delete, )); die if (! ($aws_access_key_id && $aws_secret_access_key && $bucket_name)); my $increment = 0; print "Incrementally deleting the contents of $bucket_name\n"; $| = 1; my(@procs, $current); for (1..$procs) { my($read_from_parent, $write_to_child); my($read_from_child, $write_to_parent); pipe($read_from_parent, $write_to_child) or die; pipe($read_from_child, $write_to_parent) or die; threads->create(sub { close($read_from_child); close($write_to_child); my $old_select = select $write_to_parent; $| = 1; select $old_select; &child($read_from_parent, $write_to_parent); }) or die; close($read_from_parent); close($write_to_parent); my $old_select = select $write_to_child; $| = 1; select $old_select; push(@procs, [$read_from_child, $write_to_child]); } my $s3 = Net::Amazon::S3->new({aws_access_key_id => $aws_access_key_id, aws_secret_access_key => $aws_secret_access_key, retry => 1, }); my $bucket = $s3->bucket($bucket_name); my $deleted = 1; my $total_deleted = 0; my $last_start = time; my($start, $waited); while ($deleted > 0) { $start = time; print "\nLoading ", ($increment ? "up to $increment" : "as many as possible")," keys...\n"; my $response = $bucket->list({$increment ? ('max-keys' => $increment) : ()}) or die $s3->err . ": " . $s3->errstr . "\n"; $deleted = scalar(@{ $response->{keys} }) ; if (! $deleted) { if ($wait and ! $waited) { my $delta = $wait - ($start - $last_start); if ($delta > 0) { print "Waiting $delta second(s) to confirm bucket is empty\n"; sleep($delta); $waited = 1; $deleted = 1; next; } else { last; } } else { last; } } else { $waited = undef; } $total_deleted += $deleted; print "\nDeleting $deleted keys ($total_deleted total)...\n"; $current = 0; foreach my $key ( @{ $response->{keys} } ) { my $key_name = $key->{key}; while (! &send(escape($key_name) . "\n")) { print "Thread $current died\n"; die "No threads left\n" if (@procs == 1); if ($current == @procs-1) { pop @procs; $current = 0; } else { $procs[$current] = pop @procs; } } $current = ($current + 1) % @procs; threads->yield(); } print "Sending sync message\n"; for ($current = 0; $current < @procs; $current++) { if (! &send("\n")) { print "Thread $current died sending sync\n"; if ($current = @procs-1) { pop @procs; last; } $procs[$current] = pop @procs; $current--; } threads->yield(); } print "Reading sync response\n"; for ($current = 0; $current < @procs; $current++) { if (! &receive()) { print "Thread $current died reading sync\n"; if ($current = @procs-1) { pop @procs; last; } $procs[$current] = pop @procs; $current--; } threads->yield(); } } continue { $last_start = $start; } if ($delete) { print "Deleting bucket...\n"; $bucket->delete_bucket or die $s3->err . ": " . $s3->errstr; print "Done.\n"; } sub send { my($str) = @_; my $fh = $procs[$current]->[1]; print($fh $str); } sub receive { my $fh = $procs[$current]->[0]; scalar <$fh>; } sub child { my($read, $write) = @_; threads->detach(); my $s3 = Net::Amazon::S3->new({aws_access_key_id => $aws_access_key_id, aws_secret_access_key => $aws_secret_access_key, retry => 1, }); my $bucket = $s3->bucket($bucket_name); while (my $key = <$read>) { if ($key eq "\n") { print($write "\n") or die; next; } chomp $key; $key = unescape($key); if ($key =~ /[\r\n]/) { my(@parts) = split(/\r\n|\r|\n/, $key, -1); my(@guesses) = shift @parts; foreach my $part (@parts) { @guesses = (map(($_ . "\r\n" . $part, $_ . "\r" . $part, $_ . "\n" . $part), @guesses)); } foreach my $guess (@guesses) { if ($bucket->get_key($guess)) { $key = $guess; last; } } } $bucket->delete_key($key) or die $s3->err . ": " . $s3->errstr . "\n"; print "."; threads->yield(); } return; }