JDG Lab Notebook

Downloading ABIDE PCP data

Alternative versions: LabNotebook, nbviewer

Overview

The ABIDE preprocessed connectome project (PCP) data can be downloaded programmatically with a little hackery. This helps to keep storage space to a minimum, as we can indicate with complete specificity which files we want, and download and keep (or not keep; just use temporarily) only those. Whilst there are some scripts and APIs already out there that greatly finesse the process of grabbing data from the database, there are a few specific functionalities lacking in these that made them insufficient for my own needs. The following notes are therefore a simple set of recipes for grabbing ABIDE PCP data in a way that matched my requirements. I believe it should also generalize to the other, non-ABIDE PCP databases.

Notebook Setup

Define some variables

# define system-specific filepaths etc
# (defines `le` — a dict of local environment paths used throughout — TODO confirm)
%run ~/set_localenv_vars.py

# output dir
outdir = le['data_dir'] + '/notebooks/downloading_abide_pcp_data'
!mkdir -p $outdir


# notebook name; used below to initialize the workdocs-cloudfiles folder
nb_name = 'downloading_abide_pcp_data'


# stuff for analyses

# base URL of the FCP-INDI S3 bucket holding the ABIDE PCP outputs
s3_prefix = 'https://s3.amazonaws.com/fcp-indi/data/Projects/'\
            'ABIDE_Initiative'
    
# local destination root; downloaded files mirror the S3 key layout under here
abide_dir = le['data_dir'] + '/PCP/ABIDE/downloaded'


# stuff for workdocs-cloudfiles

# NOTE(review): these look like placeholder credentials (reversed joke strings);
# never commit real AWS keys to a notebook
aws_key = 'drowssaperucesyreva'
aws_secret = '?teytidettopsuoyevah'  

Importage

# Generic imports

import glob
import os
import sys
import urllib
from copy import deepcopy

# urlretrieve moved to urllib.request in Python 3; this shim works on both
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

import numpy as np
import pandas as pd

from IPython.display import Image, display as d


# workdocs-cloudfiles stuff

sys.path.append(le['ipynb_workdocs_dir'])
from new_utils import nb_fig,cloudfiles_nb

Initialize workdocs-cloudfiles folder

# initialize the cloud-files folder for this notebook's assets
cnb = cloudfiles_nb('aws', [aws_key,aws_secret])
cnb.initialize_folder(nb_name)

Load calico document extensions

%%javascript
// load the calico notebook extensions (spell check, document tools, cell tools)
IPython.load_extensions('calico-spell-check', 'calico-document-tools', 
                        'calico-cell-tools');

Go to output folder

os.chdir(outdir)  # work from the notebook's output directory

Ok, let's get cracking...

Get metadata file

# Download the ABIDE phenotypic metadata table from S3.
# NOTE: urllib.urlretrieve exists only in Python 2; the `urlretrieve`
# name (imported Py2/Py3-compatibly in the imports cell) works on both.
md_file = 'Phenotypic_V1_0b_preprocessed1.csv'
url = s3_prefix + '/' + md_file
out_file = abide_dir + '/' + md_file
urlretrieve(url, out_file)

Take a look at metadata file

# Load the phenotypic table and preview key columns for the first rows.
df_md = pd.read_csv(out_file)
cols = ['subject', 'SUB_ID', 'FILE_ID', 'AGE_AT_SCAN','SEX', 'DX_GROUP']
# .ix is deprecated and removed in modern pandas; .loc gives the same
# label-based (end-inclusive) row slice on the default integer index
d(df_md[cols].loc[0:5])
subject SUB_ID FILE_ID AGE_AT_SCAN SEX DX_GROUP
0 50002 50002 no_filename 16.77 1 1
1 50003 50003 Pitt_0050003 24.45 1 1
2 50004 50004 Pitt_0050004 19.09 1 1
3 50005 50005 Pitt_0050005 13.73 2 1
4 50006 50006 Pitt_0050006 13.37 1 1
5 50007 50007 Pitt_0050007 17.78 1 1

Download example with one subject

# example subject: site name plus zero-padded subject ID
# (together these form the FILE_ID pattern seen in the metadata table above)
site = 'Caltech'
sub = '0051456'

Get CPAC pipeline output

# Choose which CPAC-preprocessed functional output to fetch.
pipeline = 'cpac'
strategy = 'filt_global'
derivative = 'func_preproc'
suffix = '.nii.gz'

# Build the S3-relative key. The derivative name appears twice:
# once as a folder and once in the filename (hence {2} reused).
path = '/Outputs/{0}/{1}/{2}/{3}_{4}_{2}{5}'.format(
    pipeline, strategy, derivative, site, sub, suffix)
d(path)
'/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
# local destination mirrors the S3 key layout under abide_dir
out_file = abide_dir + path
out_folder = os.path.dirname(out_file)
# create the destination folder on first use
if not os.path.isdir(out_folder): os.makedirs(out_folder)

This is where it's coming from

# full S3 source URL for this file
url = s3_prefix + path
d(url)
'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'

This is where it's going

d(out_file)  # local destination path
'/alexandra/mcintosh_lab/john/Data/PCP/ABIDE/downloaded/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'

Now get it.

# Fetch the functional file. urllib.urlretrieve exists only in Python 2;
# `urlretrieve` (imported Py2/Py3-compatibly in the imports cell) works on both.
urlretrieve(url, out_file)

Check that file has arrived

ls $out_folder/
Caltech_0051456_func_preproc.nii.gz  Caltech_0051457_func_preproc.nii.gz

Get freesurfer output

# Freesurfer outputs live under /Outputs/freesurfer/5.1/<site>_<sub>/...
fs_folder = 'surf'
fs_filename = 'lh.orig'
path = '/Outputs/freesurfer/5.1/{0}_{1}/{2}/{3}'.format(
    site, sub, fs_folder, fs_filename)

# local destination mirrors the S3 key layout; make the folder if needed
out_file = abide_dir + path
out_folder = os.path.dirname(out_file)
if not os.path.isdir(out_folder):
    os.makedirs(out_folder)

This is where it's coming from

# full S3 source URL for this file
url = s3_prefix + path
d(url)
'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/freesurfer/5.1/Caltech_0051456/surf/lh.orig'

This is where it's going

d(out_file)  # local destination path
'/alexandra/mcintosh_lab/john/Data/PCP/ABIDE/downloaded/Outputs/freesurfer/5.1/Caltech_0051456/surf/lh.orig'

Do it

# Fetch the freesurfer file. urllib.urlretrieve exists only in Python 2;
# `urlretrieve` (imported Py2/Py3-compatibly in the imports cell) works on both.
urlretrieve(url, out_file)

Check for outputs

ls $out_folder/
lh.orig    lh.thickness  rh.orig    rh.thickness
lh.sphere  lh.white      rh.sphere  rh.white

Now do the above for all the freesurfer files we need

# Relative paths of all freesurfer files we need per subject:
# surface geometry, sphere and thickness for both hemispheres,
# plus key volumes from the mri folder.

# surf folder
surf_files = ['lh.orig', 'rh.orig', 'lh.white', 'rh.white', 'lh.sphere', 'rh.sphere',
              'lh.thickness', 'rh.thickness']

# mri folder
mri_files = ['brain.mgz', 'brainmask.mgz', 'orig.mgz']

# comprehensions instead of manual append loops
fs_paths = (['surf/%s' % f for f in surf_files]
            + ['mri/%s' % f for f in mri_files])
    

These are the files

d(fs_paths)  # show the relative paths we will fetch
['surf/lh.orig',
 'surf/rh.orig',
 'surf/lh.white',
 'surf/rh.white',
 'surf/lh.sphere',
 'surf/rh.sphere',
 'surf/lh.thickness',
 'surf/rh.thickness',
 'mri/brain.mgz',
 'mri/brainmask.mgz',
 'mri/orig.mgz']

Grab em

# Download every required freesurfer file for this subject.
# Skips files already present locally, so re-running the cell is cheap
# (same pattern as the multi-subject loop used elsewhere in this notebook).
# `urlretrieve` is the Py2/Py3-compatible name from the imports cell.
for f in fs_paths: 
  path = '/Outputs/freesurfer/5.1/%s_%s/%s' %(site, sub, f)
  url = s3_prefix + path
  out_file = abide_dir + path
  out_folder = os.path.dirname(out_file)
  if not os.path.isdir(out_folder):
    os.makedirs(out_folder)
  if not os.path.isfile(out_file):
    urlretrieve(url, out_file)

Several subjects and folders

Ok, this should be clear enough now.

In practice, in general, we will probably want to be pulling entire groups in one go. This would be done with something like the following:

# Batch download: for each subject, grab the functional preproc output under
# every preprocessing strategy, plus all required freesurfer files.
# Existing local files are skipped, so the loop is safe to re-run.
# `urlretrieve` is the Py2/Py3-compatible name from the imports cell
# (urllib.urlretrieve exists only in Python 2).
pipeline = 'cpac'
derivative = 'func_preproc'
suffix = '.nii.gz'
subs = ['0051456', '0051457']
strategies = ['filt_global', 'filt_noglobal', 'nofilt_global', 'nofilt_noglobal']

for sub in subs:
    
  # Get functional data: one file per preprocessing strategy
    
  for strategy in strategies: 
    path = '/Outputs/%s/%s/%s/%s_%s_%s%s' %(pipeline, strategy, derivative,
                                            site, sub, derivative, suffix)
    
    url = s3_prefix + path
    out_file = abide_dir + path
    out_folder = os.path.dirname(out_file)
    if not os.path.isdir(out_folder):
      os.makedirs(out_folder)    
    if not os.path.isfile(out_file):
      urlretrieve(url, out_file)
      #!wget $url -O $out_file # (alternative to urllib command)

  # Get freesurfer data

  for f in fs_paths: 
    path = '/Outputs/freesurfer/5.1/%s_%s/%s' %(site, sub, f)  
    url = s3_prefix + path
    out_file = abide_dir + path
    out_folder = os.path.dirname(out_file)
    if not os.path.isdir(out_folder):
      os.makedirs(out_folder)    
    if not os.path.isfile(out_file):
      urlretrieve(url, out_file)
      #!wget $url -O $out_file # (alternative to urllib command)                    

Ok, that's a wrap.

misc PCP