@azam
Forked from hugs/archive-twitpic-data.py
Last active August 29, 2015 14:06

Revisions

  1. azamshul revised this gist Sep 6, 2014. 1 changed file with 46 additions and 31 deletions.
@@ -7,29 +7,44 @@
 import shutil
 import urllib2
+import socket
 import json
 import time
 import os
 
 USERNAME = "your_username_goes_here"
 TMP_FILE_NAME = "tmpfile"
+MAX_RETRIES = 5
+SLEEP_TIME = 2
+TIMEOUT = 5000
 
 page = 1
 has_more_page = True
 photo_count = -1
 processed_photo_count = 0
 
 # Target Page
 api = "https://api.twitpic.com/2/users/show.json?username=%s&page=" % USERNAME
 
 # Get the data about the target page
 while has_more_page:
     print "Processing page: " + str(page)
-    try:
-        raw_data = urllib2.urlopen(api + str(page))
-    except urllib2.URLError, e:
-        print "Failed retrieving page: " + str(page)
-        break
+    has_page_error = True
+    for i in range(MAX_RETRIES):
+        try:
+            raw_data = urllib2.urlopen(api + str(page), timeout=TIMEOUT)
+            has_page_error = False
+            if i > 0:
+                print "Retry successful page: " + str(page)
+            break
+        except urllib2.URLError, e:
+            print "Failed retrieving page: " + str(page)
+            time.sleep(SLEEP_TIME)
+        except socket.timeout:
+            print "Timeout retrieving page: " + str(page)
+            time.sleep(SLEEP_TIME)
+    if has_page_error:
+        has_more_page = False
+        break
 
     json_data = json.load(raw_data)

@@ -41,12 +56,6 @@
     # Get the info about each image on the page
     images = json_data["images"]
 
-    # Update photo count
-    photo_count = int(json_data["photo_count"])
-    processed_photo_count += len(images)
-
-    # Check if there is more page
-    has_more_page = processed_photo_count < photo_count
-    page += 1
 
     for item in images:
@@ -57,24 +66,30 @@
         file_name = file_id + "." + file_type
 
         if not os.path.exists(file_name):
-            # Remove temp file if exists
-            try:
-                os.remove(TMP_FILE_NAME)
-            except OSError:
-                pass
-
-            try:
-                # Save the file to temporary file
-                req = urllib2.urlopen(file_url)
-                with open(TMP_FILE_NAME, "wb") as tmp_file:
-                    shutil.copyfileobj(req, tmp_file)
-
-                # Rename to actual file
-                os.rename(TMP_FILE_NAME, file_name)
-
-                # Set the file time
-                os.utime(file_name,(file_time, file_time))
-            except urllib2.URLError, e:
-                print "Failed retrieving image ID: " + file_id
+            for i in range(MAX_RETRIES):
+                # Remove temp file if exists
+                try:
+                    os.remove(TMP_FILE_NAME)
+                except OSError:
+                    pass
+                try:
+                    # Save the file to temporary file
+                    req = urllib2.urlopen(file_url, timeout=TIMEOUT)
+                    with open(TMP_FILE_NAME, "wb") as tmp_file:
+                        shutil.copyfileobj(req, tmp_file)
+
+                    # Rename to actual file
+                    os.rename(TMP_FILE_NAME, file_name)
+
+                    # Set the file time
+                    os.utime(file_name,(file_time, file_time))
+
+                    if i > 0:
+                        print "Retry successful for image ID: " + file_id
+                    break
+                except urllib2.URLError, e:
+                    print "Failed retrieving image ID: " + file_id
+                    time.sleep(SLEEP_TIME)
+                except socket.timeout:
+                    print "Timeout retrieving image ID: " + file_id
+                    time.sleep(SLEEP_TIME)
         else:
             print "Skipped image ID: " + file_id
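
This revision replaces the single try/except around each network call with a bounded retry loop: up to MAX_RETRIES attempts per URL, a fixed SLEEP_TIME pause between attempts, and an explicit timeout on every urlopen call, for the page fetch and the image download alike. The middle hunk also removes the photo_count bookkeeping, leaving the retry failure path (which sets has_more_page to False) as the way the page loop ends. The sketch below factors the same retry pattern into one helper; the helper name fetch_with_retries is mine, not part of the gist. Note that urllib2's timeout argument is in seconds, so the gist's TIMEOUT = 5000 allows each attempt up to 5000 seconds; the sketch uses 5.

    import socket
    import time
    import urllib2

    MAX_RETRIES = 5
    SLEEP_TIME = 2   # seconds to pause between attempts
    TIMEOUT = 5      # per-attempt socket timeout, in seconds

    def fetch_with_retries(url):
        # Return an open response, or None once every attempt has failed.
        for attempt in range(MAX_RETRIES):
            try:
                response = urllib2.urlopen(url, timeout=TIMEOUT)
                if attempt > 0:
                    print "Retry successful: " + url
                return response
            except urllib2.URLError:
                print "Failed retrieving: " + url
                time.sleep(SLEEP_TIME)
            except socket.timeout:
                print "Timeout retrieving: " + url
                time.sleep(SLEEP_TIME)
        return None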
  2. azamshul revised this gist Sep 5, 2014. 1 changed file with 20 additions and 10 deletions.
@@ -5,7 +5,7 @@
 #
 # License: MIT
 
-import urllib
+import shutil
 import urllib2
 import json
 import time
@@ -25,7 +25,12 @@
 # Get the data about the target page
 while has_more_page:
     print "Processing page: " + str(page)
-    raw_data = urllib2.urlopen(api + str(page))
+    try:
+        raw_data = urllib2.urlopen(api + str(page))
+    except urllib2.URLError, e:
+        print "Failed retrieving page: " + str(page)
+        break
 
     json_data = json.load(raw_data)
 
     # Save the page data
@@ -58,13 +63,18 @@
             except OSError:
                 pass
 
-            # Save the file to temporary file
-            urllib.urlretrieve (file_url, TMP_FILE_NAME)
-
-            # Rename to actual file
-            os.rename(TMP_FILE_NAME, file_name)
-
-            # Set the file time
-            os.utime(file_name,(file_time, file_time))
+            try:
+                # Save the file to temporary file
+                req = urllib2.urlopen(file_url)
+                with open(TMP_FILE_NAME, "wb") as tmp_file:
+                    shutil.copyfileobj(req, tmp_file)
+
+                # Rename to actual file
+                os.rename(TMP_FILE_NAME, file_name)
+
+                # Set the file time
+                os.utime(file_name,(file_time, file_time))
+            except urllib2.URLError, e:
+                print "Failed retrieving image ID: " + file_id
         else:
             print "Skipped image ID: " + file_id
  3. azamshul revised this gist Sep 5, 2014. 1 changed file with 32 additions and 7 deletions.
@@ -12,14 +12,19 @@
 import os
 
 USERNAME = "your_username_goes_here"
-NUMBER_OF_PAGES_TO_DOWNLOAD = 5
+TMP_FILE_NAME = "tmpfile"
 
+page = 1
+has_more_page = True
+photo_count = -1
+processed_photo_count = 0
+
 # Target Page
 api = "https://api.twitpic.com/2/users/show.json?username=%s&page=" % USERNAME
 
 # Get the data about the target page
-for page in range(1, NUMBER_OF_PAGES_TO_DOWNLOAD+1):
-    print page
+while has_more_page:
+    print "Processing page: " + str(page)
     raw_data = urllib2.urlopen(api + str(page))
     json_data = json.load(raw_data)

@@ -31,15 +36,35 @@
     # Get the info about each image on the page
     images = json_data["images"]
 
+    # Update photo count
+    photo_count = int(json_data["photo_count"])
+    processed_photo_count += len(images)
+
+    # Check if there is more page
+    has_more_page = processed_photo_count < photo_count
+    page += 1
+
     for item in images:
         file_id = item["short_id"]
         file_type = item["type"]
         file_time = time.mktime(time.strptime(item["timestamp"], "%Y-%m-%d %H:%M:%S"))
         file_url = "https://twitpic.com/show/full/"+file_id
         file_name = file_id + "." + file_type
 
-        # Save the file
-        urllib.urlretrieve (file_url, file_name)
+        if not os.path.exists(file_name):
+            # Remove temp file if exists
+            try:
+                os.remove(TMP_FILE_NAME)
+            except OSError:
+                pass
 
-        # Set the file time
-        os.utime(file_name,(file_time, file_time))
+            # Save the file to temporary file
+            urllib.urlretrieve (file_url, TMP_FILE_NAME)
+
+            # Rename to actual file
+            os.rename(TMP_FILE_NAME, file_name)
+
+            # Set the file time
+            os.utime(file_name,(file_time, file_time))
+        else:
+            print "Skipped image ID: " + file_id
4. @hugs revised this gist Sep 5, 2014. 1 changed file with 3 additions and 1 deletion.
@@ -2,6 +2,8 @@
 #
 # A cleaned-up fork of Terence Eden's original archiver:
 # http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
+#
+# License: MIT
 
 import urllib
 import urllib2
@@ -33,7 +35,7 @@
         file_id = item["short_id"]
         file_type = item["type"]
         file_time = time.mktime(time.strptime(item["timestamp"], "%Y-%m-%d %H:%M:%S"))
-        file_url = "http://twitpic.com/show/full/"+file_id
+        file_url = "https://twitpic.com/show/full/"+file_id
         file_name = file_id + "." + file_type
 
         # Save the file
5. @hugs revised this gist Sep 5, 2014. 1 changed file with 1 addition and 1 deletion.
@@ -33,7 +33,7 @@
         file_id = item["short_id"]
         file_type = item["type"]
         file_time = time.mktime(time.strptime(item["timestamp"], "%Y-%m-%d %H:%M:%S"))
-        file_url = "https://twitpic.com/show/full/"+file_id
+        file_url = "http://twitpic.com/show/full/"+file_id
         file_name = file_id + "." + file_type
 
         # Save the file
6. @hugs revised this gist Sep 5, 2014. 1 changed file with 4 additions and 4 deletions.
@@ -9,7 +9,7 @@
 import time
 import os
 
-USERNAME = 'your_username_goes_here'
+USERNAME = "your_username_goes_here"
 NUMBER_OF_PAGES_TO_DOWNLOAD = 5
 
 # Target Page
@@ -22,18 +22,18 @@
     json_data = json.load(raw_data)
 
     # Save the page data
-    page_file = open('page-%s.json' % page,'w')
+    page_file = open("page-%s.json" % page,"w")
     page_file.write(json.dumps(json_data, indent=2))
     page_file.close()
 
     # Get the info about each image on the page
     images = json_data["images"]
 
     for item in images:
-        file_id = item['short_id']
+        file_id = item["short_id"]
         file_type = item["type"]
         file_time = time.mktime(time.strptime(item["timestamp"], "%Y-%m-%d %H:%M:%S"))
-        file_url = "http://twitpic.com/show/full/"+file_id
+        file_url = "https://twitpic.com/show/full/"+file_id
         file_name = file_id + "." + file_type
 
         # Save the file
7. @hugs revised this gist Sep 5, 2014. 1 changed file with 1 addition and 1 deletion.
@@ -1,4 +1,4 @@
-# Archive your TwitPic photos and metadata
+# Archive your Twitpic photos and metadata
 #
 # A cleaned-up fork of Terence Eden's original archiver:
 # http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
8. @hugs revised this gist Sep 5, 2014. 1 changed file with 1 addition and 0 deletions.
@@ -21,6 +21,7 @@
     raw_data = urllib2.urlopen(api + str(page))
     json_data = json.load(raw_data)
 
+    # Save the page data
     page_file = open('page-%s.json' % page,'w')
     page_file.write(json.dumps(json_data, indent=2))
     page_file.close()
9. @hugs revised this gist Sep 5, 2014. 1 changed file with 1 addition and 1 deletion.
@@ -1,6 +1,6 @@
 # Archive your TwitPic photos and metadata
 #
-# A cleaned-up fork of Terence Eden's original exporter:
+# A cleaned-up fork of Terence Eden's original archiver:
 # http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
 
 import urllib
10. @hugs revised this gist Sep 5, 2014. 1 changed file with 3 additions and 1 deletion.
@@ -1,5 +1,7 @@
 # Archive your TwitPic photos and metadata
-# A fork of Terence Eden's http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
+#
+# A cleaned-up fork of Terence Eden's original exporter:
+# http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
 
 import urllib
 import urllib2
11. @hugs revised this gist Sep 5, 2014. 1 changed file with 0 additions and 2 deletions.
@@ -4,8 +4,6 @@
 import urllib
 import urllib2
 import json
-import collections
-import HTMLParser
 import time
 import os

12. @hugs created this gist Sep 5, 2014.
@@ -0,0 +1,42 @@
+# Archive your TwitPic photos and metadata
+# A fork of Terence Eden's http://shkspr.mobi/blog/2013/08/exporting-twitpic-images-python/
+
+import urllib
+import urllib2
+import json
+import collections
+import HTMLParser
+import time
+import os
+
+USERNAME = 'your_username_goes_here'
+NUMBER_OF_PAGES_TO_DOWNLOAD = 5
+
+# Target Page
+api = "https://api.twitpic.com/2/users/show.json?username=%s&page=" % USERNAME
+
+# Get the data about the target page
+for page in range(1, NUMBER_OF_PAGES_TO_DOWNLOAD+1):
+    print page
+    raw_data = urllib2.urlopen(api + str(page))
+    json_data = json.load(raw_data)
+
+    page_file = open('page-%s.json' % page,'w')
+    page_file.write(json.dumps(json_data, indent=2))
+    page_file.close()
+
+    # Get the info about each image on the page
+    images = json_data["images"]
+
+    for item in images:
+        file_id = item['short_id']
+        file_type = item["type"]
+        file_time = time.mktime(time.strptime(item["timestamp"], "%Y-%m-%d %H:%M:%S"))
+        file_url = "http://twitpic.com/show/full/"+file_id
+        file_name = file_id + "." + file_type
+
+        # Save the file
+        urllib.urlretrieve (file_url, file_name)
+
+        # Set the file time
+        os.utime(file_name,(file_time, file_time))
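
One detail of the original script that every later revision keeps is stamping each saved image with its Twitpic upload time: the API's timestamp string is parsed with time.strptime, converted to epoch seconds with time.mktime, and written to the file as both access and modification time via os.utime. A small isolated illustration with made-up values; note that time.mktime interprets the parsed time in the local timezone, so if the API reports UTC the stamped time may be offset:

    import os
    import time

    stamp = "2014-09-05 12:34:56"   # hypothetical API timestamp value
    epoch = time.mktime(time.strptime(stamp, "%Y-%m-%d %H:%M:%S"))
    os.utime("abc123.jpg", (epoch, epoch))   # sets (atime, mtime)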