Source code for current_flow_allostery.bootstrap_betweenness_ks

try:
    import argparse
except ImportError:
    raise ImportError("require argparse")

#numerical / data packages
try:
    import numpy as np
    np.set_printoptions(threshold=10)
except ImportError:
    raise ImportError("require numpy")
try:
    import pandas as pd
except ImportError:
    raise ImportError("require pandas")
try:
    import scipy
    import scipy.sparse
    import scipy as sp
    import scipy.stats
except ImportError:
    raise ImportError("require scipy (NOTE: check scipy submodules)")

#utilities
import sys
import gc
import time

##others


#self defined functions
if __name__ == "__main__":
    from functions import database_mod as db_m
else:
    from .functions import database_mod as db_m

if __name__ == "__main__":
    parser=argparse.ArgumentParser(description=
        "Loads the specified GB interaction network and calculates the corresponding flow betweenness network")
    
    parser.add_argument(
        '-dbp','--databasePath',dest='databasePath',
        help='Path to the directory containing the interaction network file. (required)'
    )
    
    parser.add_argument(
         '-odbp','--outputDatabase',default=None,
        help='This parameter controls the path of the database that will be used for output'+\
             'By default, the database being read from will also be written to for output.'
    )
    
    parser.add_argument(
         '-qs','--querySQL',default='SELECT *; FROM Networks',
         dest='querySQL',
        help='SQL string to load the edge data from the database.'+\
             '\nDefaluts to "SELECT *; FROM Networks" to select all data from'+\
             'the "Networks" table'
    )
    parser.add_argument(
        '-sgc','--systemGroupColumn',default='system',
        dest='systemGroupColumn',
        help='Name of the column used to group different systems to be tested.'+\
             '\nIf this occurs over multiple columns in the database, those columns'+\
             '\nwill need to be merged into a single column using the "querySQL" query'+\
             '\nand then the appropriate merged name(s) entered here.'+\
             '\nE.g. if you have a system and variant column and your reference "system"'+\
             '\nis defined as system="wt" and variant="standard", and you also need the'+\
             '\nSeqid_1,Seqid_2,Chain_Delta, and Betweenness columns for grouping / testing.'+\
             '\nthen you would use:'+\
             '\n --querySQL "SELECT system||variant AS System,Seqid_1,Seqid_2,Chain_Delta,Betweenness'+\
             '\n             FROM Networks"'+\
             '\n -sgc System -referenceSystems wtstandard. Default is "system"'
    )
    parser.add_argument(
        '-igc','--interactionGroupColumns',nargs='*',
        default=['Seqid_1','Seqid_2','Chain_Delta'],
        dest='interactionGroupColumns',
        help='Names of columns used to group individual interactions to test.'+\
             '\nDefault is "Seqid_1" "Seqid_2" "Chain_Delta"'
    )
    parser.add_argument(
        '-rs','--referenceSystems',nargs='*',
        default=['wt2'],dest='referenceSystems',
        help='List of systems to use as reference systems.'+\
             '\nThe test systems will then be the set of all other systems.'
             '\nBootstrapping is performed over all pairs of reference and test systems'+\
             '\nunless the "testAllPairs" flag is set, in which case this parameter does nothing.'+\
             '\nDefault is "wt2"'
    )
    parser.add_argument(
        '-vc','--valueColumn',default='Betweenness',dest='valueColumn',
        help='Name of the column containing the values describing the distributions to be tested.'
    )
    parser.add_argument(
        '-al','--alphas',nargs='*',default=[.1],dest='alphas',
        help='List of alpha values to be considered (where alpha = 1 - confidence level).'+\
             '\n(Note: the minimum alpha value determines the number of bootstrap samples'+\
             '\nbased on 1/(alpha/2)^2, with a minimum of 64 bootstrap samples regardless'+\
             '\nof the minimum alpha. So setting alpha smaller will require more time and memory)'
    )
    parser.add_argument(
        '-tap','--testAllPairs',nargs='?',default=False,const=True,
        dest='testAllPairs',
        help='If this flag is set, all pairs of systems will be tested. (not yet implemented)'
    )
    
    parser.add_argument(
        '-obn','--outputBase',default='Edge_Betweenness_KS',
        dest='outputBase',
        help='Base name for tables to be added to the database. If the "outputToCSV" flag'+\
             '\nis set, then this will serve as the base name of the csv datafiles instead.'+\
             '\nDefault is "Edge_Betweenness_KS". If writting to csv files, '+\
             '\nthis base name should include the entire path to where you want to save the files'
    )
    
    parser.add_argument(
        '-ocf','--outputToCSV',nargs='?',default=False,const=True,
        dest='outputToCSV',
        help='If set, csv files will be written instead of adding tables to the database'
    )
    
    parser.add_argument(
        '-wbd','--writeBootstrapDistributions',nargs='?',default=False,const=True,
        dest='writeBootstrapDistributions',
        help='If this flag is set the KS value distributions generated by bootstrapping'+\
             'will be written to tables (or csv files if the "outputToCSV" flag is set)'
    )
    
    parser.add_argument(
        '-fgc','--flushGroupCount',default=1,dest='flushGroupCount',
        help='Number of interaction groups to run before writting output.'+\
             '\nUseful for tunning performance. Generally, total time spent'+\
             '\nwritting will be lower when more groups are flushed at once,'+\
             '\nespecially when output is being written to a database'
    )
    
    parser.add_argument(
        '-dryrun',nargs='?',const=True,default=False,dest='dryrun',
        help='Dont run anything, jsut print out input argument namespace and end program'
    )
    
    parser.add_argument(
        '-v','--verbose',nargs='?',const=True,default=False,dest='verbose',
        help='controls printing of progress / information to stdout during run'
    )
    parser.add_argument(
        '-vl','--verboseLevel',default=0,dest='verboseLevel',
        help='when verbose flag is given, controls the amount of detail printed'+\
             '0: Only write basic progress messages'+\
             '\n1: Also show timing and progress percentages'+\
             '\n2: Write additional detail, such as input arguments and data statistics / headers'+\
             '\n3: Echo back all sql commands being run'
    )
    
    args=parser.parse_args()
    
    if args.verbose or args.dryrun:
        print('Input arguments:',args)
    if not args.dryrun:
        verbose=args.verbose
        verboseLevel=int(args.verboseLevel)
        
        if verbose and verboseLevel>0:
            ti=time.time()
            
        if not args.outputDatabase is None:
            outputDatabase=args.outputDatabase
        else:
            outputDatabase=args.databasePath
        
        if verbose:
            print("Starting up")
        #transcribe input arguments to variables
        databasePath=args.databasePath
        outputBase=args.outputBase
        writeBootstrapDistributions=args.writeBootstrapDistributions
        outputToCSV=args.outputToCSV
        interactionGroupColumns=args.interactionGroupColumns
        systemGroupColumn=args.systemGroupColumn
        referenceSystems=args.referenceSystems
        querySQL=args.querySQL
        flushGroupCount=int(args.flushGroupCount)
        valueColumn=args.valueColumn
        alphas=np.sort(np.unique(np.array(list(map(float,args.alphas)))))
    
#        ### This code adapted from ###
#        ### https://writeonly.wordpress.com/2009/07/16/simple-read-only-sqlalchemy-sessions/
#        ### used to ensure input SQL query string will effectively not have write access
#        def abort_ro(*args,**kwargs):
#            ''' the terrible consequences for trying 
#                to flush to the db '''
#            print("No writing allowed, tsk!  We're telling mom!")
#            return 
#        
#        def db_setup(connstring='sqlite:///'+databasePath,
#                     readOnly=True,echo=(verbose and (verboseLevel > 2))):
#            engine = create_engine(connstring, echo=echo)
#            Session = sessionmaker(
#                bind=engine, 
#                autoflush=not readOnly, 
#                autocommit=not readOnly
#            )
#            session = Session()
#            
#            if readOnly:
#                session.flush = abort_ro   # now it won't flush!
#    
#            return session, engine
#        ### ### ###
#        
#        readSession, readEngine=db_setup()
#        if not (args.outputDatabase is None):
#            writeSession, writeEngine = db_setup(
#                connstring='sqlite:///'+outputDatabase,readOnly=False)
#        else:
#            writeSession,writeEngine=db_setup(readOnly=False)
   
        ##### 
        connstring='sqlite:///'+databasePath
        
        echo=( verbose and (verboseLevel > 2))
         
        alphas=np.sort(np.unique(np.array(list(alphas))))
        readSession, readEngine= db_m.db_setup(connstring,True,echo)
        if not (outputDatabase is None):
            connstring='sqlite:///'+outputDatabase
            writeSession, writeEngine = db_m.db_setup(
                connstring,False,echo)
        else:
            writeSession,writeEngine= db_m.db_setup(connstring,False,echo)
        #####        
        
        if verbose:
            print("Loading data using input SQL query")
            sys.stdout.flush()
            if verboseLevel>0:
                t1=time.time()
        
        query = readSession.query(querySQL)
        networkData = pd.read_sql(query.statement,readEngine)
        if len(networkData)==0:
            print('Input query yielded no data! Exiting')
        else:
            if verbose:
                if verboseLevel>0:
                    t2=time.time()
                    print('-Input data query time: %.2f seconds'%(t2-t1))
                    t1=time.time()
                    if verboseLevel>1:
                        print('-- Data Extracted From Query --')
                        print(networkData.head())
                        print(networkData.describe())
                        print('-- -- --')
                print('Splitting Reference and Test System Data')
                sys.stdout.flush()
            
            refData=networkData[
                networkData[systemGroupColumn].isin(referenceSystems)
            ]
            testData=networkData[
                networkData[systemGroupColumn].isin(referenceSystems).map(lambda x: not x)
            ]
            if verbose:
                if (verboseLevel>0):
                    t2=time.time()
                    print('-Splitting time: %.2f seconds'%(t2-t1))
                    if verboseLevel>1:
                        print('--- reference data ---')
                        print(refData.head())
                        print(refData.describe())
                        print('\n--- Test Data ---')
                        print(testData.head())
                        print(testData.describe())
                print("--- --- Running Bootstrapping --- ---")
                refSystemGroups=refData.groupby(systemGroupColumn)
                testSystemGroups=testData.groupby(systemGroupColumn)
                for refSystemGroup in refSystemGroups:
                    refSystemName,refSystemData=refSystemGroup
                    for testSystemGroup in testSystemGroups:
                        testSystemName,testSystemData=testSystemGroup
                        if verbose:
                            print('--- Testing',refSystemName,'vs',testSystemName,'---')
                        interactionGroups=refSystemData.groupby(interactionGroupColumns)
                        
                        resultTables=[]
                        bootstrapTables=[]
                        for iGroup,interactionGroup in enumerate(interactionGroups):
                            interactionName,refInteractionData=interactionGroup
                            interactionQuery=' and '.join([
                                "({colname} == {colval})".format(
                                    colname=cname,colval=cval
                                ) \
                                for cname,cval in zip(interactionGroupColumns,interactionName)
                            ])
                            if verbose:
                                print('-Testing: ',interactionQuery,end=": ")
                                if verboseLevel>0:
                                    t1=time.time()
                                sys.stdout.flush()
                            testInteractionData=testSystemData.query(interactionQuery)
                            refVals=np.array(refInteractionData[valueColumn])
                            testVals=np.array(testInteractionData[valueColumn])
                            jointVals=np.concatenate([refVals,testVals])
                            
                            nBootSamples=int(np.max([64,1./(np.min(alphas)/2.)**2]))
                            if verbose:
                                print('Bootstrapping Null',end=', ')
                                sys.stdout.flush()
                            nullBootData=np.zeros(nBootSamples)
                            for iBoot in np.arange(nBootSamples):
                                nullBootData[iBoot]=sp.stats.ks_2samp(
                                    np.random.choice(a=jointVals,size=len(jointVals),replace=True),
                                    jointVals
                                ).statistic
                            if verbose:
                                print('Ref',end=', ')
                                sys.stdout.flush()
                            refBootData=np.zeros(nBootSamples)
                            for iBoot in np.arange(nBootSamples):
                                refBootData[iBoot]=sp.stats.ks_2samp(
                                    np.random.choice(a=refVals,size=len(refVals),replace=True),
                                    jointVals
                                ).statistic
                            if verbose:
                                print('Test',end='; ')
                                sys.stdout.flush()
                            testBootData=np.zeros(nBootSamples)
                            for iBoot in np.arange(nBootSamples):
                                testBootData[iBoot]=sp.stats.ks_2samp(
                                    np.random.choice(a=testVals,size=len(testVals),replace=True),
                                    jointVals
                                ).statistic
                            
                            if writeBootstrapDistributions:
                                if verbose:
                                    print('Compiling Bootstrap Data Table',end="; ")
                                    sys.stdout.flush()
                                bootDataFrame=pd.DataFrame({
                                    'Null_KS':nullBootData,
                                    'Reference_KS':refBootData,
                                    'Test_KS':testBootData
                                })
                                bootDataFrame['Reference_'+systemGroupColumn]=refSystemName
                                bootDataFrame['Test_'+systemGroupColumn]=testSystemName
                                for gColName,gColVal in zip(interactionGroupColumns,interactionName):
                                    bootDataFrame[gColName]=gColVal
                                bootstrapTables.append(bootDataFrame.copy())
                                if (len(bootstrapTables)>=flushGroupCount) or \
                                   (iGroup == (len(interactionGroups)-1)):
                                    bootDataFrame=pd.concat(bootstrapTables)
                                    if verbose:
                                        print('Flushing Bootstrap Data',end='; ')
                                        sys.stdout.flush()
                                    if outputToCSV:
                                        bootDataFrame.to_csv(
                                            outputBase+'_Bootstrap_Data.csv')
                                    else:
                                        bootDataFrame.to_sql(
                                            outputBase+'_Bootstrap_Data',
                                            con=writeEngine,if_exists='append'
                                        )
                                    bootstrapTables=[]
                                    gc.collect()
                            
                            if verbose:
                                print('Compiling Results Table',end="; ")
                                sys.stdout.flush()
                            resultsFrame=pd.DataFrame({
                                'Alpha':alphas,
                                'nullCut':[
                                    np.quantile(nullBootData,q=1.-alpha/2.) \
                                    for alpha in alphas
                                ],
                                'refCut':[
                                    np.quantile(refBootData,q=alpha/2.) \
                                    for alpha in alphas
                                ],
                                'testCut':[
                                    np.quantile(testBootData,q=alpha/2.) \
                                    for alpha in alphas
                                ]
                            })
                            resultsFrame['Ref_Differs']=resultsFrame['refCut']>resultsFrame['nullCut']
                            resultsFrame['Test_Differs']=resultsFrame['testCut']>resultsFrame['nullCut']
                            resultsFrame['Reference_'+systemGroupColumn]=refSystemName
                            resultsFrame['Test_'+systemGroupColumn]=testSystemName
                            for gColName,gColVal in zip(interactionGroupColumns,interactionName):
                                resultsFrame[gColName]=gColVal
                            resultTables.append(resultsFrame.copy())
                            if (len(resultTables)>=flushGroupCount) or \
                               (iGroup == (len(interactionGroups)-1)):
                                if verbose:
                                    print('Flushing Results Tables',end=';')
                                    sys.stdout.flush()
                                resultsFrame=pd.concat(resultTables)
                                if outputToCSV:
                                    resultsFrame.to_csv(
                                        outputBase+'_Results.csv'
                                    )
                                else:
                                    resultsFrame.to_sql(
                                        outputBase+'_Results',
                                        con=writeEngine,if_exists='append'
                                    )
                                resultTables=[]
                                gc.collect()
                            
                            if verbose:
                                if verboseLevel>0:
                                    t2=time.time()
                                    print(' time=%.2f s'%(t2-t1),end="")
                                print("")
                                sys.stdout.flush()
                            gc.collect()
        if verbose and verboseLevel>0:
            tf=time.time()
            print('--- --- --- --- ---')
            print('Total Run Time: %.4f minutes'%((tf-ti)/60.))
    
###########################

[docs]def bootstrap_betweenness(databasePath,outputDatabase=None,querySQL='SELECT *; FROM Networks',systemGroupColumn='system',interactionGroupColumns=['Seqid_1','Seqid_2','Chain_Delta'],referenceSystems=['wt2'],valueColumn='Betweenness',alphas=[.1],testAllPairs=False,outputBase='Edge_Betweenness_KS',outputToCSV=False,writeBootstrapDistributions=False,flushGroupCount=1,dryrun=False,verbose=True,verboseLevel=0): """ Default ------- databasePath INPUT MUST BE GIVEN ouputDatabase databasePath querySQL 'SELECT *; FROM Networks' systemGroupColumn 'system' interactionGroupColumns ['Seqid_1','Seqid_2','Chain_Delta'] referenceSystems ['wt2'] valueColumn 'Betweenness' alphas [.1] testAllPairs False outputBase 'Edge_Betweenness_KS' outputToCSV False writeBootstrapDistributions False flushGroupCount 1 dryrun False verbose False verboseLevel 0 Example ------- current_flow_allostery.bootstrap_betweenness(\ 'output_2/GB_Network.db',\ 'output_3/GB_Betweenness_Bootstrapped_KS.db',\ querySQL='SELECT * FROM Networks WHERE (Seqid_1={1})',\ alphas=[0.05, 0.1, 0.15],\ writeBootstrapDistributions=True,\ flushGroupCount=25,\ verboseLevel=2\ ) Other notes ----------- """ if databasePath == None: print('Path to the directory containing the interaction network file. (required)') if outputDatabase == None: outputDatabase = databasePath ##### Start of code if verbose or dryrun: print('Input arguments:',databasePath,outputDatabase,querySQL,systemGroupColumn,interactionGroupColumns,referenceSystems,valueColumn,alphas,testAllPairs,outputBase,outputToCSV,writeBootstrapDistributions,flushGroupCount,dryrun,verbose,verboseLevel) if not dryrun: if verbose and verboseLevel>0: ti=time.time() if verbose: print("Starting up") ##### connstring='sqlite:///'+databasePath echo=( verbose and (verboseLevel > 2)) alphas=np.sort(np.unique(np.array(list(alphas)))) readSession, readEngine= db_m.db_setup(connstring,True,echo) if not (outputDatabase is None): connstring='sqlite:///'+outputDatabase writeSession, writeEngine = db_m.db_setup( connstring,False,echo) else: writeSession,writeEngine= db_m.db_setup(connstring,False,echo) ##### if verbose: print("Loading data using input SQL query") sys.stdout.flush() if verboseLevel>0: t1=time.time() networkData = pd.read_sql(querySQL,readEngine) if len(networkData)==0: print('Input query yielded no data! Exiting') else: if verbose: if verboseLevel>0: t2=time.time() print('-Input data query time: %.2f seconds'%(t2-t1)) t1=time.time() if verboseLevel>1: print('-- Data Extracted From Query --') print(networkData.head()) print(networkData.describe()) print('-- -- --') print('Splitting Reference and Test System Data') sys.stdout.flush() refData=networkData[ networkData[systemGroupColumn].isin(referenceSystems) ] testData=networkData[ networkData[systemGroupColumn].isin(referenceSystems).map(lambda x: not x) ] if verbose: if (verboseLevel>0): t2=time.time() print('-Splitting time: %.2f seconds'%(t2-t1)) if verboseLevel>1: print('--- reference data ---') print(refData.head()) print(refData.describe()) print('\n--- Test Data ---') print(testData.head()) print(testData.describe()) print("--- --- Running Bootstrapping --- ---") refSystemGroups=refData.groupby(systemGroupColumn) testSystemGroups=testData.groupby(systemGroupColumn) for refSystemGroup in refSystemGroups: refSystemName,refSystemData=refSystemGroup for testSystemGroup in testSystemGroups: testSystemName,testSystemData=testSystemGroup if verbose: print('--- Testing',refSystemName,'vs',testSystemName,'---') interactionGroups=refSystemData.groupby(interactionGroupColumns) resultTables=[] bootstrapTables=[] for iGroup,interactionGroup in enumerate(interactionGroups): interactionName,refInteractionData=interactionGroup interactionQuery=' and '.join([ "({colname} == {colval})".format( colname=cname,colval=cval ) \ for cname,cval in zip(interactionGroupColumns,interactionName) ]) if verbose: print('-Testing: ',interactionQuery,end=": ") if verboseLevel>0: t1=time.time() sys.stdout.flush() testInteractionData=testSystemData.query(interactionQuery) refVals=np.array(refInteractionData[valueColumn]) testVals=np.array(testInteractionData[valueColumn]) jointVals=np.concatenate([refVals,testVals]) nBootSamples=int(np.max([64,1./(np.min(alphas)/2.)**2])) if verbose: print('Bootstrapping Null',end=', ') sys.stdout.flush() nullBootData=np.zeros(nBootSamples) for iBoot in np.arange(nBootSamples): nullBootData[iBoot]=sp.stats.ks_2samp( np.random.choice(a=jointVals,size=len(jointVals),replace=True), jointVals ).statistic if verbose: print('Ref',end=', ') sys.stdout.flush() refBootData=np.zeros(nBootSamples) for iBoot in np.arange(nBootSamples): refBootData[iBoot]=sp.stats.ks_2samp( np.random.choice(a=refVals,size=len(refVals),replace=True), jointVals ).statistic if verbose: print('Test',end='; ') sys.stdout.flush() testBootData=np.zeros(nBootSamples) for iBoot in np.arange(nBootSamples): testBootData[iBoot]=sp.stats.ks_2samp( np.random.choice(a=testVals,size=len(testVals),replace=True), jointVals ).statistic if writeBootstrapDistributions: if verbose: print('Compiling Bootstrap Data Table',end="; ") sys.stdout.flush() bootDataFrame=pd.DataFrame({ 'Null_KS':nullBootData, 'Reference_KS':refBootData, 'Test_KS':testBootData }) bootDataFrame['Reference_'+systemGroupColumn]=refSystemName bootDataFrame['Test_'+systemGroupColumn]=testSystemName for gColName,gColVal in zip(interactionGroupColumns,interactionName): bootDataFrame[gColName]=gColVal bootstrapTables.append(bootDataFrame.copy()) if (len(bootstrapTables)>=flushGroupCount) or \ (iGroup == (len(interactionGroups)-1)): bootDataFrame=pd.concat(bootstrapTables) if verbose: print('Flushing Bootstrap Data',end='; ') sys.stdout.flush() if outputToCSV: bootDataFrame.to_csv( outputBase+'_Bootstrap_Data.csv') else: bootDataFrame.to_sql( outputBase+'_Bootstrap_Data', con=writeEngine,if_exists='append' ) bootstrapTables=[] gc.collect() if verbose: print('Compiling Results Table',end="; ") sys.stdout.flush() resultsFrame=pd.DataFrame({ 'Alpha':alphas, 'nullCut':[ np.quantile(nullBootData,q=1.-alpha/2.) \ for alpha in alphas ], 'refCut':[ np.quantile(refBootData,q=alpha/2.) \ for alpha in alphas ], 'testCut':[ np.quantile(testBootData,q=alpha/2.) \ for alpha in alphas ] }) resultsFrame['Ref_Differs']=resultsFrame['refCut']>resultsFrame['nullCut'] resultsFrame['Test_Differs']=resultsFrame['testCut']>resultsFrame['nullCut'] resultsFrame['Reference_'+systemGroupColumn]=refSystemName resultsFrame['Test_'+systemGroupColumn]=testSystemName for gColName,gColVal in zip(interactionGroupColumns,interactionName): resultsFrame[gColName]=gColVal resultTables.append(resultsFrame.copy()) if (len(resultTables)>=flushGroupCount) or \ (iGroup == (len(interactionGroups)-1)): if verbose: print('Flushing Results Tables',end=';') sys.stdout.flush() resultsFrame=pd.concat(resultTables) if outputToCSV: resultsFrame.to_csv( outputBase+'_Results.csv' ) else: resultsFrame.to_sql( outputBase+'_Results', con=writeEngine,if_exists='append' ) resultTables=[] gc.collect() if verbose: if verboseLevel>0: t2=time.time() print(' time=%.2f s'%(t2-t1),end="") print("") sys.stdout.flush() gc.collect() if verbose and verboseLevel>0: tf=time.time() print('--- --- --- --- ---') print('Total Run Time: %.4f minutes'%((tf-ti)/60.))