In my current job I needed to find avatars for a bunch of web pages, but bigger than favicon size. I was at a loss until I remembered that Facebook makes avatars available in a whole range of sizes through its API, so I decided to write a Python script against the Open Graph API.

Pass the script a text file containing tab-separated values of the website name and domain, and it will search Facebook for the name and download the avatar, saving it with a file name like www_domain_com.png (dots in the domain replaced with underscores).
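Under the hood the script only needs two Graph API calls: a page search to turn a site title into a Facebook UID, and the /picture endpoint to fetch that page's avatar. Here's a rough standalone sketch of those two calls, the same ones the full script below makes (the search term and output file name are just examples I've picked for illustration):

import json
import urllib

term = 'ABC News'  # example search term
search_url = 'https://graph.facebook.com/search?q=' + urllib.quote_plus(term) + '&type=page'
result = json.load(urllib.urlopen(search_url))
page_id = result['data'][0]['id']  # UID of the first page in the results

# The page's large avatar lives at /<uid>/picture?type=large
avatar_url = 'https://graph.facebook.com/' + page_id + '/picture?type=large'
urllib.urlretrieve(avatar_url, 'abc_news.png')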

#!/usr/bin/python
# Copyright 2012 Mark Perdomo
 
 
import os
import re
import sys
import urllib
from urlparse import urlparse
import csv
import time
import json
 
"""Facebook Avatar Downloader
This program, given a list of site titles and URLs,
looks up each site's Facebook UID and downloads
the avatar.
"""
 
def read_urls(filename):
    """Returns a list of titles and FQDNs from a
    file full of URLs.  Screens out duplicates
    and returns the titles and URLs in
    increasing order as a tuple."""
 
    url_list = []
 
    # The csv module gives us quick access to tab-separated values
    with open(filename) as tsv:
        for line in csv.reader(tsv, dialect="excel-tab"):
            # Use urlparse to pull the FQDN (netloc) out of the URL
            try:
                url = line[1]
                o = urlparse(url)
                match = o.netloc
                #If there is a valid URL, append the title/FQDN tuple
                if len(match) > 1:
                    url_list.append((line[0],match))
            except IndexError:
                print str(line) + ' --> NOT MATCHED!'
 
    # Drop duplicates, then sort
    return sorted(set(url_list))
 
 
def lookup_uid(term):
    """Does a FB opengraph search for a supplied
    string and returns the first Facebook UID in
    search"""
 
    search_url_head = 'https://graph.facebook.com/search?q='
    search_url_tail = '&type=page'
    # URL-encode the search term so spaces and punctuation are handled safely
    search_url = search_url_head + urllib.quote_plus(term) + search_url_tail
 
    response = json.load(urllib.urlopen(search_url))
    if response:
        try:
            return response['data'][0]['id']
        except (IndexError, KeyError):
            print 'No matching page found for', term
            return 'null'
    else:
        print 'No response'
        return 'null'
 
 
def download_image(item, dest_dir, file_name):
    """Constructs an FB OpenGraph query and downloads
    the images to the directory.  Provide a tuple with
    (FQDN,Facebook uid) and the destination directory"""
 
    url_head = 'https://graph.facebook.com/'
    url_tail = '/picture?type=large'
    download_url = url_head + item + url_tail
 
    # Only create the destination directory if one was actually supplied
    if dest_dir and not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
 
    if item != 'null':
        print 'Retrieving...', download_url
        urllib.urlretrieve(download_url, os.path.join(dest_dir, file_name))
    else:
        print 'There was a problem getting', file_name
 
def check_missing(site_list, dest_dir):
    """This function double-checks your work and tells
    you if a domain does not have an image"""
    error = []
    for item in site_list:
        file_name = re.sub(r'\.', '_', item[1]) + '.png'
        if not os.path.exists(os.path.join(dest_dir, file_name)):
            error.append(item[1])
    if error:
        print '\n\nNo image for these domains:'
        for url in error:
            print url
    else:
        print '\n\nAll domains accounted for'
 
def main():
    args = sys.argv[1:]
 
    if not args:
        print 'Please specify an input file'
        sys.exit(1)
 
    todir = ''
    if args[0] == '--todir':
        todir = args[1]
        del args[0:2]
 
    items = read_urls(args[0])
 
 
    for item in items:
        site_title = item[0]
        site_url = item[1]
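        # The FQDN-based file name doubles as a cache key: if the image
        # already exists on disk we skip the site, so re-runs don't repeat API calls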
        file_name = re.sub(r'\.', '_', site_url) + '.png'
        if not os.path.exists(os.path.join(todir, file_name)):
            site_id = lookup_uid(site_title)
            download_image(site_id, todir, file_name)
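            # Throttle the requests a little so we don't hammer the API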
            time.sleep(.75)
 
 
    check_missing(items, todir)
 
if __name__ == '__main__':
    main()

File input is a txt file that looks like this:
Title Root Domain
ABC News http://abcnews.go.com/
Al Jazeera http://www.aljazeera.com/
Al-Monitor http://www.al-monitor.com/
The Atlantic http://www.theatlantic.com/
The Atlantic Politics http://www.theatlantic.com/politics/
The Atlantic Wire http://www.theatlanticwire.com/
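
Assuming the list above is saved as sites.txt and the script as get_avatars.py (both names are just examples), you'd run it like this:

python get_avatars.py --todir avatars sites.txt

The --todir flag is optional; leave it off and the images are saved in the current directory.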

The code uses the FQDN as a "unique identifier": the file name is built from the domain, and if that file already exists the site is skipped, so running the script multiple times won't generate redundant API calls. I also throttle the requests with time.sleep(). This script saved me a lot of time; now I only have to sort out the bad matches and missing domains.