Alternative versions: LabNotebook, nbviewer
Overview¶
The ABIDE Preprocessed Connectomes Project (PCP) data can be downloaded programmatically with a little hackery. This helps to keep storage space to a minimum, as we can specify exactly which files we want, and download and keep (or just use temporarily) only those. While there are already some scripts and APIs out there that greatly finesse the process of grabbing data from the database, a few specific functionalities missing from them made them insufficient for my needs. The following notes are therefore a simple set of recipes for grabbing ABIDE PCP data in a way that matched my requirements. I believe they should also generalize to the other, non-ABIDE PCP databases.
Notebook Setup¶
Define some variables
# define system-specific filepaths etc
%run ~/set_localenv_vars.py
# output dir
outdir = le['data_dir'] + '/notebooks/downloading_abide_pcp_data'
!mkdir -p $outdir
nb_name = 'downloading_abide_pcp_data'
# stuff for analyses
s3_prefix = 'https://s3.amazonaws.com/fcp-indi/data/Projects/'\
'ABIDE_Initiative'
abide_dir = le['data_dir'] + '/PCP/ABIDE/downloaded'
# stuff for workdocs-cloudfiles
aws_key = 'drowssaperucesyreva'
aws_secret = '?teytidettopsuoyevah'
Importage
# Generic imports
from IPython.display import Image,display as d
from copy import deepcopy
import os,glob,sys
import numpy as np
import pandas as pd
import urllib
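# note: urllib.urlretrieve, used throughout below, is Python 2 syntax;
# under Python 3 the equivalent function is urllib.request.urlretrieve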
# workdocs-cloudfiles stuff
sys.path.append(le['ipynb_workdocs_dir'])
from new_utils import nb_fig,cloudfiles_nb
Initialize workdocs-cloudfiles folder
cnb = cloudfiles_nb('aws', [aws_key,aws_secret])
cnb.initialize_folder(nb_name)
Load calico document extensions
%%javascript
IPython.load_extensions('calico-spell-check', 'calico-document-tools',
'calico-cell-tools');
Go to output folder
os.chdir(outdir)
Ok, let's get cracking...
Get metadata file¶
md_file = 'Phenotypic_V1_0b_preprocessed1.csv'
url = s3_prefix + '/' + md_file
out_file = abide_dir + '/' + md_file
# make sure the local download folder exists first
if not os.path.isdir(abide_dir): os.makedirs(abide_dir)
urllib.urlretrieve(url,out_file);
Take a look at the metadata file
df_md = pd.read_csv(out_file)
cols = ['subject', 'SUB_ID', 'FILE_ID', 'AGE_AT_SCAN','SEX', 'DX_GROUP']
d(df_md[cols].loc[0:5])
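The metadata table is also the natural place to build download lists from. Here is a minimal sketch, assuming (as in the phenotypic file above) that SITE_ID and DX_GROUP columns are present and that FILE_ID is set to 'no_filename' for subjects without imaging data:
# sketch: select Caltech subjects that have imaging data, split by diagnosis
# (in the ABIDE phenotypic file DX_GROUP is 1 for ASD and 2 for controls)
df_caltech = df_md[(df_md['SITE_ID'] == 'Caltech')
                   & (df_md['FILE_ID'] != 'no_filename')]
asd_ids = df_caltech[df_caltech['DX_GROUP'] == 1]['FILE_ID'].tolist()
con_ids = df_caltech[df_caltech['DX_GROUP'] == 2]['FILE_ID'].tolist()
d([len(asd_ids), len(con_ids)])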
Download example with one subject¶
site = 'Caltech'
sub = '0051456'
Get CPAC pipeline output¶
pipeline = 'cpac'
strategy = 'filt_global'
derivative = 'func_preproc'
suffix = '.nii.gz'
path = '/Outputs/%s/%s/%s/%s_%s_%s%s' %(pipeline,strategy, derivative,
site,sub, derivative, suffix)
d(path)
'/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
out_file = abide_dir + path
out_folder = os.path.dirname(out_file)
if not os.path.isdir(out_folder): os.makedirs(out_folder)
This is where it's coming from
url = s3_prefix + path
d(url)
'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
This is where it's going
d(out_file)
'/alexandra/mcintosh_lab/john/Data/PCP/ABIDE/downloaded/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
Now get it.
urllib.urlretrieve(url,out_file);
Check that the file has arrived
ls $out_folder/
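Since the same three steps (build the url, create the local folder, download if not already present) get repeated for every file below, it can be convenient to wrap them in a small helper. This is just a minimal sketch using the s3_prefix and abide_dir variables defined above; the name fetch_abide_file is for illustration only and isn't part of any PCP tooling:
def fetch_abide_file(path):
    # path is an S3 key relative to the ABIDE_Initiative project, e.g.
    # '/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
    url = s3_prefix + path
    out_file = abide_dir + path
    out_folder = os.path.dirname(out_file)
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)
    if not os.path.isfile(out_file):
        urllib.urlretrieve(url,out_file)
    return out_file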
Get FreeSurfer output¶
fs_folder = 'surf'
fs_filename = 'lh.orig'
path = '/Outputs/freesurfer/5.1/%s_%s/%s/%s' %(site,sub,fs_folder, fs_filename)
out_file = abide_dir + path
out_folder = os.path.dirname(out_file)
if not os.path.isdir(out_folder): os.makedirs(out_folder)
This is where it's coming from
url = s3_prefix + path
d(url)
'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/freesurfer/5.1/Caltech_0051456/surf/lh.orig'
This is where it's going
d(out_file)
'/alexandra/mcintosh_lab/john/Data/PCP/ABIDE/downloaded/Outputs/freesurfer/5.1/Caltech_0051456/surf/lh.orig'
Do it
urllib.urlretrieve(url,out_file);
Check for outputs
ls $out_folder/
Now do the above for all the FreeSurfer files we need
# surf folder
surf_files = ['lh.orig', 'rh.orig', 'lh.white', 'rh.white', 'lh.sphere' ,'rh.sphere',
'lh.thickness', 'rh.thickness']
# mri folder
mri_files = ['brain.mgz','brainmask.mgz', 'orig.mgz']
fs_paths = []
for f in surf_files: fs_paths.append('surf/%s' %f)
for f in mri_files: fs_paths.append('mri/%s' %f)
These are the files
d(fs_paths)
['surf/lh.orig', 'surf/rh.orig', 'surf/lh.white', 'surf/rh.white', 'surf/lh.sphere', 'surf/rh.sphere', 'surf/lh.thickness', 'surf/rh.thickness', 'mri/brain.mgz', 'mri/brainmask.mgz', 'mri/orig.mgz']
Grab em
for f in fs_paths:
    path = '/Outputs/freesurfer/5.1/%s_%s/%s' %(site,sub,f)
    url = s3_prefix + path
    out_file = abide_dir + path
    out_folder = os.path.dirname(out_file)
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)
    urllib.urlretrieve(url,out_file);
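If you want to confirm that the downloads are intact rather than just present, the surfaces and volumes can be loaded and inspected. A quick sketch; nibabel isn't used elsewhere in this notebook, so treat it as an optional extra dependency:
# sanity check: load one surface, one morphometry file, and one volume
import nibabel as nib
fs_dir = abide_dir + '/Outputs/freesurfer/5.1/%s_%s' %(site,sub)
verts,faces = nib.freesurfer.read_geometry(fs_dir + '/surf/lh.white')
thick = nib.freesurfer.read_morph_data(fs_dir + '/surf/lh.thickness')
brain = nib.load(fs_dir + '/mri/brain.mgz')
d([verts.shape, thick.shape, brain.shape])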
Several subjects and folders¶
Ok, the pattern should be clear enough now.
In practice we will generally want to pull entire groups of subjects in one go. This can be done with something like the following:
pipeline = 'cpac'
derivative = 'func_preproc'
suffix = '.nii.gz'
subs = ['0051456', '0051457']
strategies = ['filt_global', 'filt_noglobal', 'nofilt_global', 'nofilt_noglobal']
for sub in subs:

    # Get functional data
    for strategy in strategies:
        path = '/Outputs/%s/%s/%s/%s_%s_%s%s' %(pipeline,strategy, derivative,
                                                site,sub, derivative, suffix)
        url = s3_prefix + path
        out_file = abide_dir + path
        out_folder = os.path.dirname(out_file)
        if not os.path.isdir(out_folder):
            os.makedirs(out_folder)
        if not os.path.isfile(out_file):
            urllib.urlretrieve(url,out_file);
            #!wget $url -O $out_file # (alternative to urllib command)

    # Get freesurfer data
    for f in fs_paths:
        path = '/Outputs/freesurfer/5.1/%s_%s/%s' %(site,sub,f)
        url = s3_prefix + path
        out_file = abide_dir + path
        out_folder = os.path.dirname(out_file)
        if not os.path.isdir(out_folder):
            os.makedirs(out_folder)
        if not os.path.isfile(out_file):
            urllib.urlretrieve(url,out_file);
            #!wget $url -O $out_file # (alternative to urllib command)
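Rather than hard-coding the subject list, the FILE_ID column from the metadata table can drive the same loop, since FILE_ID values already encode site and subject (e.g. 'Caltech_0051456'). A sketch, reusing the hypothetical fetch_abide_file helper and the df_caltech selection from earlier:
# sketch: drive the downloads from the metadata rather than a hard-coded list
for file_id in df_caltech['FILE_ID']:
    # functional data, all four preprocessing strategies
    for strategy in strategies:
        fetch_abide_file('/Outputs/%s/%s/%s/%s_%s%s'
                         %(pipeline, strategy, derivative, file_id, derivative, suffix))
    # freesurfer data
    for f in fs_paths:
        fetch_abide_file('/Outputs/freesurfer/5.1/%s/%s' %(file_id,f))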
Ok, that's a wrap.