# Source code for current_flow_allostery.betweenness

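"""
Flow betweenness calculation for GB interaction networks.

This module can be run directly as a command-line script, or imported in
order to call the ``betweenness`` function.
"""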
try:
    import argparse
except ImportError:
    raise ImportError("require argparse")

#numerical / data packages
try:
    import numpy as np
    np.set_printoptions(threshold=10)
except ImportError:
    raise ImportError("require numpy")
try:
    import pandas as pd
except ImportError:
    raise ImportError("require pandas")
try:
    import scipy
    import scipy.sparse
    import scipy as sp
except ImportError:
    raise ImportError("require scipy")

#utilities
import os
import sys
import gc
import copy
import time

#self defined functions
if __name__ == "__main__":
    from functions import betweenness_calc as bt_calc
else:
    from .functions import betweenness_calc as bt_calc

if __name__ == "__main__":
    # Command-line entry point: parse the options, load the interaction
    # network, compute the flow betweenness with bt_calc.getBtwMat and write
    # the requested CSV files into the output directory.
    print("Invoking original")
    parser=argparse.ArgumentParser(description="Loads the specified GB interaction network and calculates the corresponding flow betweenness network")

    parser.add_argument(
        '-indir','--inputDirectory',default='.',dest='inDir',
        help='Path to the directory containing the interaction network file. Defaults to currently active directory'
    )
    parser.add_argument(
        '-i', '--interactionFileName',
        help='Name of the GB interaction energy data file to load (required). This data file should contain a'+
             '\nsingle interaction network. If there are more networks / interactions present, you will need to'+
             '\nuse the "-selectionQueryStrings" argument to specify a pandas.DataFrame.query search query string'+
             '\nthat can select a single network from the data. If duplicate edges are present, the program'+
             '\nwill crash or give unpredictable results'
    )

    parser.add_argument(
        '-q','--selectionQueryStrings',nargs='*',
        help='List of query strings to select entries from the interaction data that specify a single network'+
             '\nto be analyzed. If multiple strings are provided, the results are concatenated using pd.concat'
    )
    parser.add_argument(
        '-c','--NodeColumns',default=['Resid_1','Resid_2'],dest='nodeColumns',nargs='*',
        help='Names of the columns containing the names of the interacting nodes for each interaction entry'+
             '\nexactly two arguments should be given. If not only the first two entries will get used.'
    )
    parser.add_argument(
        '-e','--energyColumn',default='TOTAL',
        help='Name of the column containing the energy values of each interaction used to compute betweenness weights'
    )
    # NOTE(review): the -s / -t help texts talk about DataFrame.query strings,
    # but the values are looked up below as literal node names. The overlap
    # handling promised in the -t help is not implemented in this block either.
    parser.add_argument(
        '-s','--sourceNodeNames',nargs='+',dest='sourceNodeNames',
        help='string to be fed to the pandas DataFrame.query function to collect a list of source node names'+
             '\nif multiple entries are given, each will be fed and the results aggregated into a list of'+
             '\nof unique node names'
    )
    parser.add_argument(
        '-t','--targetNodeNames',nargs='+',dest='targetNodeNames',
        help='string to be fed to the pandas DataFrame.query function to collect a list of target node names'+
             '\nif multiple entries are given, each will be fed and the results aggregated into a list of'+
             '\nof unique node names. Note: if there is any overlap. I.e. sourceNodes and targetNodes litst'+
             '\ncontain some of the same node(s) then a warning will be thrown and the common nodes will be put'+
             '\ninto the source node list.'
    )

    parser.add_argument(
        '-outdir','--outputFileDirectory',default='.',dest='outDir',
        help='Path of the directory to write output files to'
    )
    parser.add_argument(
        '-o','--outputFileNameBase',
        help='Base of the filenames e.g. edge betweenness would be in "outputFileNameBase.EdgeBetweenness.csv" (required)'
    )
    parser.add_argument(
        '-ft','--writeFullTable',nargs='?',default=False,const=True,
        help='If this flag is set, the output table will contain all data from each row of the input dataframe'+
             'otherwise it will only contain the node columns and betweenness column'
    )
    parser.add_argument(
        '-wnvec','--writeNodeVector',nargs='?',const=True,default=False,
        help='If flag is given, node betweenness will also be computed written to "outputFileNameBase.NodeBetweenness.csv"'
    )

    parser.add_argument(
        '-windmap','--writeMatrixIndexToNodeNameMap',nargs='?',const=True,default=False,
        help='If this flag is given, a data frame containging the columns "MatInd" and "NodeName" is written to'+
             '\n"outputFileNameBase.IndToNameMap.csv"'
    )

    parser.add_argument(
        '-dryrun',nargs='?',const=True,default=False,
        help='Dont run anything, jsut print out input argument namespace and end program'
    )

    parser.add_argument(
        '-v','--verbose',nargs='?',const=True,default=False,
        help='controls printing of progress / information to stdout during run'
    )
    parser.add_argument(
        '-vl','--verboseLevel',default=0,
        help='when verbose flag is given, controls the amount of detail printed'
    )

    args=parser.parse_args()

    if args.verbose or args.dryrun:
        print('Input arguments:',args)
    if not args.dryrun:
        # These options have no usable default. They are checked here rather
        # than with required=True so that "-dryrun" keeps working without them.
        missingOptions=[
            optionName for optionName,optionValue in (
                ('-i/--interactionFileName',args.interactionFileName),
                ('-o/--outputFileNameBase',args.outputFileNameBase),
                ('-s/--sourceNodeNames',args.sourceNodeNames),
                ('-t/--targetNodeNames',args.targetNodeNames),
            ) if optionValue is None
        ]
        if missingOptions:
            parser.error('missing required option(s): '+', '.join(missingOptions))
        if len(args.nodeColumns) < 2:
            parser.error('-c/--NodeColumns needs two column names')

        verbose=args.verbose
        # Command-line values arrive as strings; always compare the int form.
        verboseLevel=int(args.verboseLevel)
        outFileBase=args.outputFileNameBase
        inputFile=os.path.join(args.inDir,args.interactionFileName)
        if verbose:
            print('loading data',end='\n' if verboseLevel==0 else ",")
        tempData=pd.read_csv(inputFile)
        if args.selectionQueryStrings:
            if verbose and (verboseLevel > 0):
                print('-filtering loaded data')
            interactionData=pd.concat([
                tempData.query(selectionQuery).copy()
                for selectionQuery in args.selectionQueryStrings
            ])
        else:
            interactionData=tempData
        del tempData

        # As documented in the -c help: only the first two entries are used.
        nodeColumn_1,nodeColumn_2=args.nodeColumns[:2]
        if verbose and (verboseLevel > 0):
            print('Building node name to index maps')
        # np.unique returns the sorted unique node names.
        nodeNames=np.unique(np.concatenate([
                interactionData[nodeColumn_1].unique(),
                interactionData[nodeColumn_2].unique()]))
        nameToIndTable=pd.DataFrame({
            'NodeNames':np.array(nodeNames,dtype=str),
            'NodeInds':np.arange(len(nodeNames))
        })
        # Name -> matrix index lookup, built once and reused for every lookup.
        nameToInd=nameToIndTable.set_index('NodeNames')['NodeInds']
        if args.writeMatrixIndexToNodeNameMap:
            if verbose and (verboseLevel>0):
                print('saving node name indexing map')
            nameToIndTable.to_csv(
                os.path.join(args.outDir,outFileBase+'.IndToNameMap.csv'),
                index=False
            )

        if verbose and (verboseLevel > 1):
            print(interactionData.head())

        if verbose and (verboseLevel>0):
            print('building source node list')
        sourceNodeNames=np.array(args.sourceNodeNames)
        unknownSources=[name for name in sourceNodeNames if name not in nameToInd.index]
        if unknownSources:
            raise KeyError('source node name(s) not present in the network: '+', '.join(unknownSources))
        sourceNodes=np.array([nameToInd.loc[name] for name in sourceNodeNames])
        if verbose and (verboseLevel>1):
            print('source nodes:')
            print(pd.DataFrame({'NodeNames':sourceNodeNames,'MatrixIndices':sourceNodes}))

        if verbose and (verboseLevel>0):
            print('building target node list')
        targetNodeNames=np.array(args.targetNodeNames)
        unknownTargets=[name for name in targetNodeNames if name not in nameToInd.index]
        if unknownTargets:
            raise KeyError('target node name(s) not present in the network: '+', '.join(unknownTargets))
        targetNodes=np.array([nameToInd.loc[name] for name in targetNodeNames])
        if verbose and (verboseLevel>1):
            print('target nodes:')
            print(pd.DataFrame({'NodeNames':targetNodeNames,'MatrixIndices':targetNodes}))

        if verbose:
            print('Constructing network matrix')
        # Matrix row / column index of every interaction entry (edge).
        rowInds=np.asarray(nameToInd.loc[interactionData[nodeColumn_1].map(str)])
        colInds=np.asarray(nameToInd.loc[interactionData[nodeColumn_2].map(str)])
        # Edge weights are the absolute interaction energies.
        netMat=sp.sparse.coo_matrix(
            (interactionData[args.energyColumn].abs(),(rowInds,colInds)),
            shape=(len(nameToIndTable),len(nameToIndTable))
        ).toarray()

        btwMat=np.array(bt_calc.getBtwMat(
            mat=netMat,sources=sourceNodes,targets=targetNodes,
            verbose=verbose,verboseLevel=verboseLevel,
            useProgressBar=False,useLegacyAlgorithm=False
        ))

        if verbose:
            print('Compiling betweenness table')
        btwTable=pd.DataFrame({
            nodeColumn_1:interactionData[nodeColumn_1],
            nodeColumn_2:interactionData[nodeColumn_2],
            'Betweenness':btwMat[(rowInds,colInds)]
        })

        if args.writeFullTable:
            if verbose:
                print('Joining Betweenness data to interaction data')
            # reset_index turns the node columns back into regular columns;
            # otherwise to_csv(index=False) below would drop them.
            btwTable=btwTable.set_index([nodeColumn_1,nodeColumn_2]).join(
                other=interactionData.set_index([nodeColumn_1,nodeColumn_2]),
                how='right').reset_index()

        if verbose:
            print('Saving betweenness data')
        btwTable.to_csv(
            os.path.join(args.outDir,outFileBase+'.EdgeBetweenness.csv'),
            index=False)

        if args.writeNodeVector:
            if verbose:
                print('Computing node betweenness')
            # Node betweenness: half of the row sum of the edge betweenness matrix.
            nodeBtw=np.sum(btwMat,axis=1)/2.
            nodeTable=pd.DataFrame({
                'NodeName':nameToIndTable['NodeNames'],
                'Betweenness':nodeBtw[nameToIndTable['NodeInds']]
            })
            if verbose:
                print('Saving node betweenness data')
            nodeTable.to_csv(
                os.path.join(args.outDir,outFileBase+'.NodeBetweenness.csv'),
                index=False)


def _lookupNodeIndices(nameToInd,nodeNames,role):
    """Translate node names into matrix indices.

    Parameters
    ----------
    nameToInd : pandas.Series
        Matrix indices, indexed by node name (as strings).
    nodeNames : str, int or sequence of these
        Node name(s) to look up. They are converted to strings first, so
        integer names are accepted as well.
    role : str
        Label ('source' or 'target') used in the error message.

    Returns
    -------
    tuple of numpy.ndarray
        The normalised (string) node names and their matrix indices.

    Raises
    ------
    KeyError
        If one or more names do not occur in ``nameToInd``.
    """
    nodeNames=np.atleast_1d(nodeNames).astype(str)
    unknownNames=[str(name) for name in nodeNames if name not in nameToInd.index]
    if unknownNames:
        raise KeyError(
            role+' node name(s) not present in the network: '+', '.join(unknownNames))
    nodeInds=np.array([nameToInd.loc[name] for name in nodeNames])
    return nodeNames,nodeInds


def betweenness(inDir,outDir,interactionFileName,outputFileNameBase='NO_NAME',
                selectionQueryStrings=None,nodeColumns=('Resid_1','Resid_2'),
                energyColumn='TOTAL',sourceNodeNames=None,targetNodeNames=None,
                writeFullTable=False,writeNodeVector=True,
                writeMatrixIndexToNodeNameMap=True,dryrun=False,verbose=True,
                verboseLevel=0):
    """Compute the flow betweenness of an interaction network and save it.

    The interaction table is read from ``inDir/interactionFileName``, turned
    into a dense weight matrix (absolute value of ``energyColumn``) and passed
    to ``bt_calc.getBtwMat``. Results are written as CSV files to ``outDir``.

    Parameters
    ----------
    inDir : str or None
        Directory containing the interaction file; ``None`` means ``'.'``.
    outDir : str or None
        Directory the output files are written to; ``None`` means ``'.'``.
    interactionFileName : str
        Name of the CSV file holding the interaction network (must be given).
    outputFileNameBase : str, default 'NO_NAME'
        Base name of the output files.
    selectionQueryStrings : sequence of str, optional
        ``pandas.DataFrame.query`` strings; the rows selected by each string
        are concatenated and used as the network.
    nodeColumns : sequence of str, default ('Resid_1', 'Resid_2')
        Names of the two columns holding the interacting node names. Only the
        first two entries are used.
    energyColumn : str, default 'TOTAL'
        Column whose absolute values are used as edge weights.
    sourceNodeNames, targetNodeNames : sequence of str or int
        Names of the source / target nodes (must be given). Names are
        converted to strings before lookup, so ``['14', '240']`` and
        ``[14, 240]`` are equivalent.
    writeFullTable : bool, default False
        If true, all columns of the input table are joined to the edge
        betweenness table; otherwise only the node columns and the
        ``Betweenness`` column are written.
    writeNodeVector : bool, default True
        If true, node betweenness is also written to
        ``<outputFileNameBase>.NodeBetweenness.csv``.
    writeMatrixIndexToNodeNameMap : bool, default True
        If true, the node name / matrix index table is written to
        ``<outputFileNameBase>.IndToNameMap.csv``.
    dryrun : bool, default False
        If true, only print the input arguments and return.
    verbose : bool, default True
        Print progress information.
    verboseLevel : int, default 0
        Amount of detail printed when ``verbose`` is set.

    Returns
    -------
    None
        Edge betweenness is written to
        ``<outputFileNameBase>.EdgeBetweenness.csv`` in ``outDir``.

    Raises
    ------
    ValueError
        If ``interactionFileName``, ``sourceNodeNames`` or ``targetNodeNames``
        is missing (not raised for a dry run).
    KeyError
        If a source or target node name does not occur in the network.

    Example
    -------
    ``'{1}'``, ``'{2}'`` and ``'{3}'`` are placeholders for the input
    directory, the output directory and the base name of the data file::

        current_flow_allostery.betweenness(
            '{1}',
            '{2}',
            '{3}.csv',
            '{3}.Betweenness',
            sourceNodeNames=['14','240','466','692','918','1144'],
            targetNodeNames=['47','273','499','725','951','1177'],
            writeFullTable=True,
            verboseLevel=2
        )

    Other notes
    -----------
    If there are multiple files to be run, this can be run in parallel
    following the example in Step1_Run_GB_Network_Betweenness.slurm.bash.
    """
    ##### Default settings of the variables
    if inDir is None:
        inDir='.'
    if outDir is None:
        outDir='.'
    # Keep a list so the printed argument dump looks the same as before.
    nodeColumns=list(nodeColumns)

    missingInputs=[]
    if interactionFileName is None:
        print('INPUT FILENAME MISSING')
        missingInputs.append('interactionFileName')
    if sourceNodeNames is None:
        print('CANNOT BE BLANK: check sourceNodeNames input')
        missingInputs.append('sourceNodeNames')
    if targetNodeNames is None:
        print('CANNOT BE BLANK: check targetNodeNames input')
        missingInputs.append('targetNodeNames')
    ####################

    if verbose or dryrun:
        print('Input arguments:',inDir,outDir,interactionFileName,outputFileNameBase,selectionQueryStrings,nodeColumns,energyColumn,sourceNodeNames,targetNodeNames,writeFullTable,writeNodeVector,writeMatrixIndexToNodeNameMap,dryrun,verbose,verboseLevel)
    if dryrun:
        return
    if missingInputs:
        # Fail before any I/O instead of crashing later with an unrelated error.
        raise ValueError('missing required input(s): '+', '.join(missingInputs))

    verboseLevel=int(verboseLevel)
    outFileBase=outputFileNameBase
    inputFile=os.path.join(inDir,interactionFileName)
    if verbose:
        print('loading data',end='\n' if verboseLevel==0 else ",")
    tempData=pd.read_csv(inputFile)
    if selectionQueryStrings is not None and len(selectionQueryStrings) > 0:
        if verbose and (verboseLevel > 0):
            print('-filtering loaded data')
        interactionData=pd.concat([
            tempData.query(selectionQuery).copy()
            for selectionQuery in selectionQueryStrings
        ])
    else:
        interactionData=tempData
    del tempData

    nodeColumn_1,nodeColumn_2=nodeColumns[0],nodeColumns[1]
    if verbose and (verboseLevel > 0):
        print('Building node name to index maps')
    # np.unique returns the sorted unique node names.
    nodeNames=np.unique(np.concatenate([
        interactionData[nodeColumn_1].unique(),
        interactionData[nodeColumn_2].unique()]))
    nameToIndTable=pd.DataFrame({
        'NodeNames':np.array(nodeNames,dtype=str),
        'NodeInds':np.arange(len(nodeNames))
    })
    # Name -> matrix index lookup, built once and reused for every lookup.
    nameToInd=nameToIndTable.set_index('NodeNames')['NodeInds']
    if writeMatrixIndexToNodeNameMap:
        if verbose and (verboseLevel>0):
            print('saving node name indexing map')
        nameToIndTable.to_csv(
            os.path.join(outDir,outFileBase+'.IndToNameMap.csv'),
            index=False
        )

    if verbose and (verboseLevel > 1):
        print(interactionData.head())

    if verbose and (verboseLevel>0):
        print('building source node list')
    sourceNodeNames,sourceNodes=_lookupNodeIndices(nameToInd,sourceNodeNames,'source')
    if verbose and (verboseLevel>1):
        print('source nodes:')
        print(pd.DataFrame({'NodeNames':sourceNodeNames,'MatrixIndices':sourceNodes}))

    if verbose and (verboseLevel>0):
        print('building target node list')
    targetNodeNames,targetNodes=_lookupNodeIndices(nameToInd,targetNodeNames,'target')
    if verbose and (verboseLevel>1):
        print('target nodes:')
        print(pd.DataFrame({'NodeNames':targetNodeNames,'MatrixIndices':targetNodes}))

    if verbose:
        print('Constructing network matrix')
    # Matrix row / column index of every interaction entry (edge).
    rowInds=np.asarray(nameToInd.loc[interactionData[nodeColumn_1].map(str)])
    colInds=np.asarray(nameToInd.loc[interactionData[nodeColumn_2].map(str)])
    # Edge weights are the absolute interaction energies.
    netMat=sp.sparse.coo_matrix(
        (interactionData[energyColumn].abs(),(rowInds,colInds)),
        shape=(len(nameToIndTable),len(nameToIndTable))
    ).toarray()

    btwMat=np.array(bt_calc.getBtwMat(
        mat=netMat,sources=sourceNodes,targets=targetNodes,
        verbose=verbose,verboseLevel=verboseLevel,
        useProgressBar=False,useLegacyAlgorithm=False
    ))

    if verbose:
        print('Compiling betweenness table')
    btwTable=pd.DataFrame({
        nodeColumn_1:interactionData[nodeColumn_1],
        nodeColumn_2:interactionData[nodeColumn_2],
        'Betweenness':btwMat[(rowInds,colInds)]
    })

    if writeFullTable:
        if verbose:
            print('Joining Betweenness data to interaction data')
        # reset_index turns the node columns back into regular columns;
        # otherwise to_csv(index=False) below would drop them.
        btwTable=btwTable.set_index([nodeColumn_1,nodeColumn_2]).join(
            other=interactionData.set_index([nodeColumn_1,nodeColumn_2]),
            how='right').reset_index()

    if verbose:
        print('Saving betweenness data')
    btwTable.to_csv(
        os.path.join(outDir,outFileBase+'.EdgeBetweenness.csv'),
        index=False)

    if writeNodeVector:
        if verbose:
            print('Computing node betweenness')
        # Node betweenness: half of the row sum of the edge betweenness matrix.
        nodeBtw=np.sum(btwMat,axis=1)/2.
        nodeTable=pd.DataFrame({
            'NodeName':nameToIndTable['NodeNames'],
            'Betweenness':nodeBtw[nameToIndTable['NodeInds']]
        })
        if verbose:
            print('Saving node betweenness data')
        nodeTable.to_csv(
            os.path.join(outDir,outFileBase+'.NodeBetweenness.csv'),
            index=False)