Use this dataset from the US Census Bureau:
http://www2.census.gov/geo/docs/reference/county_adjacency.txt
!wget http://www2.census.gov/geo/docs/reference/county_adjacency.txt -O county_adjacency.txt
It's a tab-separated values file in which a value that repeats the one on the row above is left blank instead of being written out again.
!head county_adjacency.txt
Store the California adjacencies in a graph (a dict mapping each county to the set of its neighboring counties).
# Parse the Census county adjacency file and build `graph`, a dict mapping
# each California county name to the set of California counties adjacent to it.
#
# Every row is tab-separated; column 0 holds the quoted county name and
# column 2 the quoted neighbor name.  Column 0 is blank on continuation rows,
# which belong to the most recent row that named a county.
#
# NOTE(review): the file is opened with the platform default encoding.  If it
# contains non-ASCII county names this may raise on Python 3 -- confirm the
# file's encoding and pass it explicitly if needed.
with open("county_adjacency.txt") as f:
    # Iterate the file lazily instead of materialising it with readlines().
    tups = [tuple(line.rstrip().split("\t")) for line in f]

graph = {}
county = None  # county named by the most recent non-continuation row
for fields in tups:
    if len(fields) < 3:
        # Blank or truncated row: there is no neighbor column to read.
        continue
    if fields[0]:
        county = fields[0][1:-1]  # [1:-1] drops the surrounding quotes
    elif county is None:
        raise ValueError(
            "continuation row found before any county row: %r" % (fields,)
        )
    neighbor = fields[2][1:-1]
    if county == neighbor:
        # The file lists each county as adjacent to itself; skip self-loops.
        continue
    if not (county.endswith("CA") and neighbor.endswith("CA")):
        # Keep only edges whose endpoints are both in California.
        continue
    graph.setdefault(county, set()).add(neighbor)

# Single parenthesised argument: prints the same text on Python 2 and 3.
print("%d total counties" % len(graph))
Before writing the Graphviz file, we need to give each county a short identifier.
import re
# Map each county's full name to a short identifier made only of a-z and "_":
# drop the "County, CA" suffix, lower-case, trim surrounding whitespace, then
# replace every remaining character outside a-z with an underscore
# (so "Santa Cruz County, CA" becomes "santa_cruz").
idents = {
    name: re.sub(
        "[^a-z]", "_", name.replace("County, CA", "").lower().strip()
    )
    for name in graph
}

# Show a sample of the mapping.  The parenthesised single-argument form
# prints the same text on Python 2 and Python 3.
print(list(idents.items())[:10])
# Write the county graph to counties.dot as an undirected Graphviz graph.
#
# Counties and their neighbors are visited in sorted order so that the file
# is identical from run to run; iterating the dict and sets directly follows
# hash order, which for strings can differ between interpreter sessions.
with open("counties.dot", "w") as f:
    f.write("graph {\n")
    # Highlight one county so it is easy to find in the rendered picture.
    f.write(" santa_cruz [color=red];\n")
    for a in sorted(graph):
        for b in sorted(graph[a]):
            # An adjacency listed under both endpoints would otherwise be
            # written twice; emit it only from the lexicographically smaller
            # name.
            if a < b:
                f.write(" %s -- %s;\n" % (idents[a], idents[b]))
    f.write("}\n")
!head counties.dot
%%time
!fdp counties.dot -Tpng -o counties.png
from IPython.display import Image
# Display counties.png (the output file of the fdp command) inline.  This must
# remain the last expression of the cell for the notebook to render it.
Image("counties.png")