Source code for adnipy.adni

# -*- coding: utf-8 -*-

"""Pandas dataframe extension for ADNI."""

# pylint: disable=R0914

# Third party imports
import pandas as pd


[docs]@pd.api.extensions.register_dataframe_accessor("adni") class ADNI: """Dataframe deals with ADNI data. This class presents methods, which are designed to work with data from the ADNI database. """ DATES = [ # Collections "Acq Date", "Downloaded", # ADNIMERGE "EXAMDATE", "EXAMDATE_bl", "update_stamp", # DESIKANLAB "USERDATE", "update_stamp", # TAUMETA "USERDATE", "USERDATE2", "SCANDATE", "TAUTRANDT", "update_stamp", # TAUMETA3 "USERDATE", "USERDATE2", "SCANDATE", "TRANDATE", "update_stamp", ] INDEX = ["Subject ID", "Image ID"] MAPPER = { # Collections "Image": "Image ID", "Image Data ID": "Image ID", "Subject": "Subject ID", "Acq Date": "SCANDATE", # ADNIMERGE "PTID": "Subject ID", # TAUMETA3 "ASSAYTIME": "TAUTIME", } def __init__(self, pandas_dataframe): """Pass dataframe to the _df attribute of ADNI object. Parameters ---------- pandas_dataframe : pd.DataFrame This dataframe will be stored in the _df attribute. Attributes ---------- _df : pd.DataFrame This represents the dataframe object, which calls the method. """ self._df = pandas_dataframe
[docs] def standard_column_names(self): """Rename dataframe columns to module standard. This function helps when working with multiple dataframes, since the same data can have different names. It will also call `rid()` on the dataframe. Returns ------- pd.DataFrame This will have standardized columns names. See Also -------- rid Examples -------- >>> subjects = pd.DataFrame({"Subject": ["101_S_1001", "102_S_1002"]}) >>> subjects Subject 0 101_S_1001 1 102_S_1002 >>> subjects.adni.standard_column_names() "VISCODE2" not included. Subject ID RID 0 101_S_1001 1001 1 102_S_1002 1002 >>> images = pd.DataFrame({"Image": [100001, 100002]}) >>> images Image 0 100001 1 100002 >>> images.adni.standard_column_names() "VISCODE2" not included. Image ID 0 100001 1 100002 """ self._df = self._df.rename(mapper=self.MAPPER, axis="columns") if "VISCODE2" in self._df.columns: self._df["VISCODE"] = self._df["VISCODE2"] del self._df["VISCODE2"] else: print('"VISCODE2" not included.') self._df = self.rid() return self._df
[docs] def standard_dates(self): """Change type of date columns to datetime. Returns ------- pd.DataFrame Dates will have the appropriate dtype. """ for date in self.DATES: if date in self._df.columns: self._df.loc[:, date] = pd.to_datetime(self._df.loc[:, date]) return self._df
[docs] def standard_index(self, index=None): """Process dataframes into a standardized format. The output is easy to read. Applying functions the the output may not work as expected. Parameters ---------- index : list of str, default None These columns will be the new index. Returns ------- pd.DataFrame An easy to read dataframe for humans. """ if index is None: index = ["Subject ID", "Image ID", "RID", "Visit", "SCANDATE"] dataframe = self._df.reset_index() dataframe = dataframe.set_index( [column for column in index if column in dataframe.columns] ) if "index" in dataframe.columns: dataframe = dataframe.drop(columns="index") dataframe = dataframe.dropna(axis="columns", how="all") dataframe = dataframe.sort_index() return dataframe
[docs] def rid(self): """Add a roster ID column. Will not work if 'RID' is already present or 'Subject ID' is missing. Returns ------- pd.DataFrame Dataframe with a 'RID' column. Examples -------- >>> subjects = {"Subject ID": ["100_S_1000", "101_S_1001"]} >>> collection = pd.DataFrame(subjects) >>> collection Subject ID 0 100_S_1000 1 101_S_1001 >>> collection.adni.rid() Subject ID RID 0 100_S_1000 1000 1 101_S_1001 1001 """ collection = self._df missing_rid = "RID" not in collection.columns contains_subject_id = "Subject ID" in collection.columns if missing_rid and contains_subject_id: collection["RID"] = collection["Subject ID"].map( lambda subject_id: pd.to_numeric(subject_id[-4:]) ) return collection
[docs] def drop_dynamic(self): """Remove images which are dynamic. Drops all rows, in which the Description contains 'Dynamic'. Returns ------- pd.DataFrame All images that are not dynamic. """ no_dynamic = self._df[~self._df["Description"].str.contains("Dynamic")] return no_dynamic
[docs] def groups(self, grouped_mci=True): """Create a dataframe for each group and save it to a csv file. Parameters ---------- grouped_mci : bool, default True If true, 'LMCI' and 'EMCI' are treated like 'MCI'. However, the original values will stills be in the output. Returns ------- dict Dictionnairy with a dataframe for each group. """ collection = self._df # creates dataframe for each group group_names = collection["Group"].unique() groups = {} for group in group_names: group_df = collection[collection["Group"] == group] groups[group] = group_df # groups MCIs if grouped_mci is True: mci = collection[collection["Group"].isin(["MCI", "LMCI", "EMCI"])] if not mci.empty: groups["MCI"] = mci if "LMCI" in group_names: del groups["LMCI"] if "EMCI" in group_names: del groups["EMCI"] return groups
[docs] def longitudinal(self): """ Keep only longitudinal data. This requires an 'RID' or 'Subject ID' column in the dataframe. Do not use if multiple images are present for a single timepoint. Parameters ---------- images : pd.DataFrame This dataframe will be modified. Returns ------- pd.DataFrame A dataframe with only longitudinal data. See Also -------- drop_dynamic """ images = self.rid() longitudinal = images[images["RID"].duplicated(keep=False)] return longitudinal
[docs] def timepoints(self, second="first"): """Extract timepoints from a dataframe. Parameters ---------- second : {'first' or 'last'}, default 'first' 'last' to have the latest, 'first' to have the earliest values for timepoint 2. """ dataframe = self._df dataframe.reset_index(inplace=True) dataframe.set_index(self.INDEX, inplace=True) dataframe.sort_index(inplace=True) if "index" in dataframe.columns: dataframe = dataframe.drop(columns="index") if "Description" in dataframe.columns: raise ValueError( "Make sure that 'Description' is not in columns " "and only one image per timepoint is in the pd.DataFrame." ) df_subjects = dataframe.index.get_level_values(0) df_images = dataframe.index.get_level_values(1) timepoints = {} if second == "first": total_timepoints = max(df_subjects.value_counts()) for i in range(total_timepoints): timepoint = i + 1 timepoint_df = dataframe[~df_subjects.duplicated(keep="first")] timepoint_str = "Timepoint " + str(timepoint) timepoints[timepoint_str] = timepoint_df dataframe = dataframe[ ~df_images.isin(timepoint_df.index.get_level_values(1)) ] df_subjects = dataframe.index.get_level_values(0) df_images = dataframe.index.get_level_values(1) elif second == "last": timepoint_1 = dataframe[~df_subjects.duplicated()] timepoints["Timepoint 1"] = timepoint_1 timepoint_1_images = timepoint_1.index.get_level_values(1) after_timepoint_1 = dataframe[~df_images.isin(timepoint_1_images)] after_tp_1_images = after_timepoint_1.index.get_level_values(0) timepoint_2_last = after_timepoint_1[ ~after_tp_1_images.duplicated(keep="last") ] timepoints["Timepoint 2"] = timepoint_2_last return timepoints