Skip to content

Instantly share code, notes, and snippets.

@dlinsley
Last active September 28, 2023 17:18
Show Gist options
  • Save dlinsley/19632c38f660b236ba3f8656b76b2496 to your computer and use it in GitHub Desktop.
Save dlinsley/19632c38f660b236ba3f8656b76b2496 to your computer and use it in GitHub Desktop.

Revisions

  1. dlinsley revised this gist Dec 31, 2019. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions find_duplicate_objects.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,4 @@
    #!/usr/bin/env python3
    import boto3
    import argparse
    import string
  2. dlinsley revised this gist Oct 20, 2018. 1 changed file with 8 additions and 1 deletion.
    9 changes: 8 additions & 1 deletion find_duplicate_objects.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,13 @@
    import boto3
    import argparse
    import string

    myBucket = 'yourBucketName'
    parser = argparse.ArgumentParser('Find duplicate objects in an aws s3 bucket')
    parser.add_argument('--bucket', dest='myBucket', default='yourBucketName', help='S3 Bucket to search')

    cliArgs = parser.parse_args()

    myBucket = cliArgs.myBucket

    # Each list_objects_v2 request returns at most 1000 objects,
    # so keep issuing further list_objects_v2 requests until the end of the bucket is reached.
  3. dlinsley revised this gist Oct 17, 2017. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions find_duplicate_objects.py
    Original file line number Diff line number Diff line change
    @@ -13,8 +13,8 @@

    s3 = boto3.client('s3')

    print 'searching for duplicate objects'
    print
    print('searching for duplicate objects')
    print('')

    while lastReqLength == 1000:
    if (lastKey == ""):
    @@ -28,10 +28,10 @@
    thisSize = obj['Size']
    thisEtag = obj['ETag']
    if thisSize > 0:
    if existing.has_key(thisEtag):
    if thisEtag in existing:
    #duplicate found:
    print '!!Duplicate: -',existing[thisEtag],' - ',thisKey
    print('!!Duplicate: - %s - %s' % (existing[thisEtag], thisKey))
    else:
    existing[thisEtag] = thisKey

    print '... The End.'
    print('... The End.')
  4. dlinsley created this gist Oct 14, 2017.
    37 changes: 37 additions & 0 deletions find_duplicate_objects.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,37 @@
    import boto3

    # Report objects in an S3 bucket that share an ETag with an earlier-listed
    # object. Zero-byte objects are ignored. Output goes to stdout.

    myBucket = 'yourBucketName'

    # Each list_objects_v2 request returns at most 1000 objects, so a full page
    # (exactly 1000 entries) is taken as the signal that another request is needed.
    lastReqLength = 1000

    # Last key seen so far; the next request lists the keys that come after it.
    lastKey = ""

    # Maps ETag -> first key listed with that ETag.
    existing = {}

    s3 = boto3.client('s3')

    print('searching for duplicate objects')
    print()

    while lastReqLength == 1000:
        if lastKey == "":
            myObjects = s3.list_objects_v2(Bucket=myBucket)
        else:
            myObjects = s3.list_objects_v2(Bucket=myBucket, StartAfter=lastKey)

        # The response has no 'Contents' entry when no keys are returned
        # (empty bucket, or the request issued after a final page of exactly
        # 1000 keys), so fall back to an empty page instead of raising KeyError.
        contents = myObjects.get('Contents', [])
        lastReqLength = len(contents)

        for obj in contents:
            thisKey = obj['Key']
            lastKey = thisKey
            thisSize = obj['Size']
            thisEtag = obj['ETag']
            if thisSize > 0:
                if thisEtag in existing:
                    # duplicate found: report the first key seen with this ETag
                    # alongside the current key.
                    print('!!Duplicate: -', existing[thisEtag], ' - ', thisKey)
                else:
                    existing[thisEtag] = thisKey

    print('... The End.')