"""
Utilities to detect and convert sequence file formats.
"""

import os , os.path , re
from subprocess import Popen , PIPE


import logging
# Module-level logger, registered under the 'mobyle.converter' name.
s_log = logging.getLogger('mobyle.converter')

import Mobyle.ConfigManager
from Mobyle.MobyleError import *

# Configuration object shared by every function of this module.
# Its seqconverter() method is used in two ways below: called without argument
# the result is iterated as the collection of converter names ('SQUIZZ',
# 'READSEQ'); called with a name the result is compared to None and otherwise
# used as the executable path handed to Popen.
__cfg = Mobyle.ConfigManager.Config()

# Declares the custom '@call' epydoc field used by the docstrings of this module.
__extra_epydoc_fields__ = [('call', 'Called by','Called by')]


# Mobyle format code -> Mobyle format name.
# This table has to cover every code used by the converter tables of this
# module: code 6 (DNASTRIDER) is known to readseq and to myFormat2Code, so it
# must be resolvable here as well, otherwise looking up a readseq code raises
# a KeyError.
myCode2Format = {
    1: 'IG',
    2: 'GENBANK',
    3: 'NBRF',
    4: 'EMBL',
    5: 'GCG',
    6: 'DNASTRIDER',
    8: 'FASTA',
    13: 'RAW',
    14: 'PIR',
    19: 'XML',
    # 23: 'FFF',  TODO
    # 24: 'GFF',  TODO
    110: 'SWISSPROT',
    120: 'GDE',
}


# Mobyle format name (upper case) -> Mobyle format code.
myFormat2Code = {
    'IG': 1,
    'GENBANK': 2,
    'NBRF': 3,
    'EMBL': 4,
    'GCG': 5,
    'DNASTRIDER': 6,
    'FASTA': 8,
    'RAW': 13,
    'PIR': 14,
    'XML': 19,
    # 'FFF',  TODO
    # 'GFF',  TODO
    'SWISSPROT': 110,
    'GDE': 120,
}



# Format name as reported by squizz -> Mobyle format code.
squizz2code = {
    'IG': 1,
    'GENBANK': 2,
    'NBRF': 3,
    'EMBL': 4,
    'GCG': 5,
    # 'DNASTRIDER': 6,
    'GDE': 120,
    'FASTA': 8,
    'RAW': 13,
    'PIR': 14,
    'SWISSPROT': 110,
}


# Mobyle format code -> format name understood by squizz.
code2squizz = {
    1: 'IG',
    2: 'GENBANK',
    3: 'NBRF',
    4: 'EMBL',
    5: 'GCG',
    # 6: 'DNASTRIDER',
    120: 'GDE',
    8: 'FASTA',
    13: 'RAW',
    14: 'PIR',
    110: 'SWISSPROT',
}



# Mobyle format code -> format name as printed by readseq.
# The disabled entries are kept for reference with the reason they are off.
code2readseq = {
    1: 'IG|Stanford',
    2: 'GenBank|gb',
    3: 'NBRF',
    4: 'EMBL|em',
    5: 'GCG',
    6: 'DNAStrider',
    # 7: 'Fitch',      # neither written nor read
    8: 'Pearson|Fasta|fa',
    # 9: 'Zuker',      # neither written nor read
    # 10: 'Olsen',     # neither written nor read
    # 13: 'Plain|Raw', # not correctly supported: recognizes everything as raw
    14: 'PIR|CODATA',
    # 16: 'ASN.1',     # neither written nor read
    # 18: 'Pretty',    # not read
    19: 'XML',
    # 20: 'BLAST',     # not written
    # 21: 'SCF',       # not written
    # 23: 'FlatFeat|FFF',
    # 24: 'GFF',
}


# Format name as printed by readseq -> Mobyle format code.
readseq2code = {
    'IG|Stanford': 1,
    'GenBank|gb': 2,
    'NBRF': 3,
    'EMBL|em': 4,
    'GCG': 5,
    'DNAStrider': 6,
    # 'Fitch': 7,
    'Pearson|Fasta|fa': 8,
    # 'Zuker': 9,
    # 'Olsen': 10,
    # 'Plain|Raw': 13,  # not correctly supported: recognizes everything as raw
    'PIR|CODATA': 14,
    # 'ASN.1': 16,
    # 'Pretty': 18,
    'XML': 19,
    # 'BLAST': 20,
    # 'SCF': 21,
    # 'FlatFeat|FFF': 23,
    # 'GFF': 24,
}



# Mobyle format code -> file extension (leading dot included) given to the
# files produced by the converters.
code2suffixe = {
    1: '.ig',
    2: '.gb',
    3: '.nbrf',
    4: '.embl',
    5: '.gcg',
    6: '.strider',
    8: '.fasta',
    13: '.raw',
    14: '.pir',
    19: '.xml',
    # 23: '.fff',
    # 24: '.gff',
    110: '.sp',
    120: '.gde',
}

class UnsupportedFormatError( MobyleError ):
    """
    Error meant to signal that a converter cannot handle the requested
    sequence format. L{convert} catches it to try the next output format.
    """

 
def supportedFormat():
    """
    @return: the names of the sequence formats handled by the converters
      ( squizz and/or readseq ) enabled in the configuration, ordered by
      format code.
    @rtype: list of string
    """
    seqconverter = __cfg.seqconverter()
    codes = set()
    if 'SQUIZZ' in seqconverter:
        codes.update( code2squizz )
    if 'READSEQ' in seqconverter:
        codes.update( code2readseq )

    # A converter table may hold a code which has no Mobyle name; such a code
    # is skipped instead of raising a KeyError. Sorting makes the result
    # deterministic (a set has no defined order).
    return [ myCode2Format[ code ] for code in sorted( codes ) if code in myCode2Format ]


#===============================================================================
# def supportedFormat():
#    seqconverter = __cfg.seqconverter()
#    formats = []
#    if 'SQUIZZ' in seqconverter:
#        formats = code2squizz.keys()
# 
#    if 'READSEQ' in seqconverter:
#        if formats:
#            formats += code2readseq.keys()
#        else:
#            formats = code2readseq.keys()
#            
#    formats.sort()
#    return [ myCode2Format[ fmt ] for fmt in  formats ]
#===============================================================================



        
def format2code( prg ,format ):
    """
    @param prg: the program which has detect the format sequence
    @type prg: string 'squizz' , 'readseq'
    @param format: the format sequence return by detect
    @type format: string
    @return: the code corresponding to a sequence format
    @rtype: int
    @raise MobyleError: raise a L{MobyleError} if the format is not manage by
      Mobyle or if prg is neither 'squizz' nor 'readseq'
    """
    if prg == 'squizz':
        name2code = squizz2code
    elif prg == 'readseq':
        name2code = readseq2code
    else:
        # without this branch an unknown program ended in an UnboundLocalError
        raise MobyleError( "this converter ( " + str( prg ) + " ) is not supported by Mobyle" )

    try:
        return name2code[ format ]
    except KeyError:
        raise MobyleError( "this format ( " + str( format ) + " ) is not manage by Mobyle" )



def acceptCodes( converter ):
    """
    Give the format codes a sequence checker/converter can deal with.

    @param converter: the name of the sequence checker/converter
      ( compared case insensitively to 'SQUIZZ' and 'READSEQ' )
    @type converter: string
    @return: the codes accepted by this converter
    @rtype: list of int
    @raise MobyleError: if the converter is neither squizz nor readseq
    """
    tables = { 'SQUIZZ'  : squizz2code ,
               'READSEQ' : readseq2code ,
               }
    name = converter.upper()
    if name not in tables:
        raise MobyleError( "this converter is not supported by Mobyle" )
    return tables[ name ].values()




def detect( fileName , prg = None):
    """
    detect the Sequence format of a sequence file (use external sequence checker program: squizz ,readseq ). you should install at least one of this software.
    @param fileName: the path of the file.
    @type fileName: string
    @param prg: the name of a sequence checker/formater ( 'SQUIZZ' or 'READSEQ' ).
      If it is specified and configured, only this program is used to detect the
      sequence format. Otherwise ( None, or a name without configured path ) the
      converters defined in Config are tried in turn.
    @type prg: string
    @return: a tuple of 4 elements
       - The name of the program which has detected the format
       - the code of this format
       - the string corresponding to this format.
       - the number of sequences found
    If the format could not be determined by any programs return (None, None, None, None)
    @rtype: tuple ( string prg, int inCode, string inFormat, int seq_nb ).
    @call: L{SequenceParameter.convert}
    @raise MobyleError:
     - if the fileName doesn't exist a MobyleError is raised
     - if prg is configured but doesn't match with a supported sequence format detector
     - if no converter detected the format and at least one of them failed
       ( the last failure is raised )
    """
    fileName = str( fileName )
    if not os.path.exists( fileName ):
        raise MobyleError( "SequenceConverter.detect " + fileName + " this file doesn't exist" )

    # Explicit dispatch table. The detector used to be called through eval()
    # on a string built with the file name, which executed arbitrary code for
    # a crafted file name and failed on any name containing a double quote.
    detectors = { 'SQUIZZ'  : squizzDetect ,
                  'READSEQ' : readseqDetect ,
                  }

    if prg and __cfg.seqconverter( prg ) is not None:
        if prg not in detectors:
            raise MobyleError( "this converter; " + prg + ", is not used in Mobyle" )
        return detectors[ prg ]( fileName )

    _error = None
    for name in __cfg.seqconverter():
        detector = detectors.get( name.upper() )
        if detector is None:
            s_log.warning( "no sequence format detector for the converter: " + str( name ) )
            continue
        try:
            detected = detector( fileName )
        except MobyleError as err:
            # keep the failure and give a chance to the next converter
            _error = err
            continue
        if detected and detected[1] is not None :
            return detected

    if _error is None:
        return ( None , None , None , None )
    raise _error


def squizzDetect( fileName ):
    """
    Run squizz on a file to detect its sequence format. squizz is first called
    with the -Sn option; if this squizz version rejects the -n option it is
    called again with -S only and the number of sequences is reported as 0.

    @param fileName: the path to the sequence file
    @type fileName: string
    @return: a tuple of 4 elements
        - the name of the sequence detector used 'squizz'
        - the format Number or None
        - the format sequence detected or None
        - the number of sequences or None
      ('squizz', None, None, None) is returned when the format is not
      recognized or when squizz is not configured.
    @rtype: ('squizz', int , string , int )
    @raise MobyleError: if squizz cannot be executed, exits with a non zero
      status or reports a format unknown to Mobyle.
    """
    undetected = ( 'squizz' , None , None , None )
    squizz_path =  __cfg.seqconverter( 'SQUIZZ' )
    if squizz_path is None:
        return undetected

    def run( option ):
        # run squizz and give back its exit status and what it wrote on stderr
        try:
            squizz_pipe = Popen( [ squizz_path , option , fileName ] ,
                                 shell = False ,
                                 stdout = None ,
                                 stdin = None ,
                                 stderr = PIPE
                                 )
        except OSError as err:
            raise MobyleError( "squizz exit abnormaly: " + str( err ) )
        # communicate() reads stderr while waiting for the process, a plain
        # wait() may block for ever when the pipe buffer is full.
        report = squizz_pipe.communicate()[1]
        return squizz_pipe.returncode , report

    squizz_nb = True
    returncode , report = run( "-Sn" )
    if returncode != 0 and re.search( "squizz: invalid option -- n" , report ):
        # this squizz version cannot count the entries
        squizz_nb = False
        returncode , report = run( "-S" )
    if returncode != 0:
        raise MobyleError( "squizz exit abnormaly: " + report )

    if squizz_nb:
        pattern = re.compile( r": (.+) format, (\d+) entries\.$" )
    else:
        pattern = re.compile( r": (.+) format.$" )

    seq_format = None
    seq_nb = None
    for line in report.splitlines():
        match = pattern.search( line )
        if match :
            seq_format = match.group(1)
            if squizz_nb:
                seq_nb = int( match.group(2) )
            else:
                seq_nb = 0
            break

    if seq_format is None or seq_format == "UNKNOWN":
        return undetected
    fmt_nb = format2code( 'squizz' , seq_format )
    return ( 'squizz' , fmt_nb , seq_format , seq_nb )


def readseqDetect( fileName ):
    """
    Run readseq in verbose mode on a file to detect its sequence format.

    @param fileName: the path to the sequence file
    @type fileName: string
    @return: a tuple of 4 elements
        - the name of the sequence detector used 'readseq'
        - the format Number or None
        - the Format sequence detected or None
        - the number sequence found or None
      ('readseq', None, None, None) is returned when the format is not
      recognized, is not managed by Mobyle or when readseq is not configured.
    @rtype: ('readseq', int , string , int )
    @raise MobyleError: if readseq cannot be executed or exits with a non zero status.
    """
    undetected = ( 'readseq' , None , None , None )
    readseq_path = __cfg.seqconverter( 'READSEQ' )
    if readseq_path is None:
        return undetected

    # readseq generate automatically a genbank version of the input file
    # I don't care of this file thus I redirect it to the null device
    try:
        readseq_pipe = Popen( [ readseq_path , "-v" , "-o" , os.devnull , fileName ] ,
                              shell = False ,
                              stdout = None ,
                              stdin = None ,
                              stderr = PIPE
                              )
    except OSError as err:
        raise MobyleError( "readseq exit abnormaly: " + str( err ) )
    # communicate() reads stderr while waiting for the process, a plain
    # wait() may block for ever when the pipe buffer is full.
    report = readseq_pipe.communicate()[1]
    if readseq_pipe.returncode != 0:
        raise MobyleError( "readseq exit abnormaly: " + report )

    pattern = re.compile( r"^Sequence.*format=\s+(\d+)\.\s+(\S+)," )
    seq_format = None
    seq_nb = 0
    # One matching line is counted as one sequence. The format of the last
    # matching line is kept, whatever readseq prints after it.
    for line in report.splitlines():
        match = pattern.search( line )
        if match :
            seq_format = match.group(2)
            seq_nb = seq_nb + 1

    if seq_format is None or 'unknown' in seq_format or 'Plain' in seq_format:
        #readseq doesn't manage the raw format correctly
        #it recognize nearly every thing as raw
        return undetected

    try:
        fmt_nb = format2code( 'readseq' , seq_format )
    except MobyleError:
        #readseq could not recognize separetly sequence and alignment
        #if a valid alignment is submitt readseq recognize the format but this
        #format will be not a alignment format and  format2code will raised a mobyleError
        return undetected

    return ( 'readseq' , fmt_nb , seq_format , seq_nb )







def _discardFile( path ):
    """Remove a file left by a failed conversion; a missing file is not an error."""
    try:
        os.unlink( path )
    except OSError:
        # deliberate best effort: there is nothing to clean if the file is absent
        pass


def convert( fileName , fmtList , force = False):
    """
    convert a sequence file in a format among the fmtList. the sequence converted
     is write in a new file. During a conversion the input file is renamed
     fileName + ".ori"; it gets its name back if no conversion succeeds.
    @param fileName: the name of the Sequence file to convert
    @type fileName: string
    @param fmtList: a list of the format sequence ( see mobyle.dtd <ELEMENT seqfmt> )
    @type fmtList: list of string
    @param force: if force is True, do the conversion even if the detected format is in the accepted formats
      ( the file is then rewritten by the converter in its detected format ).
    @return: a tuple of 6 elements
      - the name of the converter used
      - the format in
      - the path of the input file ( fileName + ".ori" after a conversion )
      - the format produce by the converter ( None if no conversion was needed )
      - the path of the file containing the sequence generated by the converter
        ( fileName itself if no conversion was needed )
      - the number of sequences
    or (None, None, None, None, None, None) if the conversion isn't possible.
    @rtype: tuple (string, string , string , string , string , int )
    @raise MobyleError: if the file doesn't exist, if none of the formats of
      fmtList is known by Mobyle or if something goes wrong during the convertion
      ( permission denied to write a file, to read the sequence file ...)
    @call: L{SequenceParameter.convert}
    """
    fileName = str( fileName )
    if not os.path.exists( fileName ):
        raise MobyleError( "convert: no such file " + fileName )

    # formats unknown to Mobyle are silently ignored
    codeList = [ myFormat2Code[ fmt.upper() ] for fmt in fmtList if fmt.upper() in myFormat2Code ]
    if not codeList:
        if fmtList:
            msg = "the formats ( specified in the xml ) %s are not supported by Mobyle" % fmtList
        else:
            msg = "there is no dataFormats in acceptedDataFormats"
        raise MobyleError( msg )

    converters = { 'SQUIZZ'  : squizzConvert ,
                   'READSEQ' : readseqConvert ,
                   }

    for converter in __cfg.seqconverter():
        prg , inCode , inFormat , seq_nb = detect( fileName , prg = converter )

        if inCode is None:
            continue #try whith the next converter

        if inCode in codeList:
            if not force:
                #      fmtPrg , fmtIn ,  inFileName ,fmtOut , outFileName , seq_nb
                return ( prg , inFormat , fileName , None , fileName , seq_nb )
            # forced conversion: rewrite the file in its own format.
            # ( this assignment used to be overwritten by the list below )
            commonCodes = [ inCode ]
        else:
            # a list comprehension instead of a set intersection to keep the fmtList order
            accepted = acceptCodes( converter )
            commonCodes = [ code for code in codeList if code in accepted ]

        if not commonCodes:
            continue
        if converter not in converters:
            raise MobyleError( "this converter; " + converter + ", is not used in Mobyle" )
        convertFile = converters[ converter ]

        # The input is renamed only once per converter: the output file may
        # get the name of the input file. ( renaming inside the loop failed on
        # the second output format since fileName was already gone )
        oriFileName = fileName + ".ori"
        os.rename( fileName , oriFileName )
        converted = False
        outFileName = None
        try:
            for outCode in commonCodes :
                outFileName = os.path.splitext( fileName )[0] + code2suffixe[ outCode ]
                try:
                    result = convertFile( oriFileName , outCode , inCode , outFileName )
                except UnsupportedFormatError:
                    _discardFile( outFileName )
                    continue #try the next outFormat

                # only the 4 first fields are meaningful
                outPrg , outInFormat , outFormat , outSeqNb = result[ :4 ]
                if outFormat is None:
                    _discardFile( outFileName )
                    continue #try the next outFormat

                converted = True
                #      fmtPrg , fmtIn ,  inFileName , fmtOut , outFileName , seq_nb
                return ( outPrg , outInFormat , oriFileName , outFormat , outFileName , outSeqNb )
        finally:
            if not converted:
                # put the input file back, for the next converter and for the caller
                if outFileName is not None:
                    _discardFile( outFileName )
                os.rename( oriFileName , fileName )

    #      fmtPrg , fmtIn ,  inFileName , fmtOut , outFileName , seq_nb
    return ( None , None ,     None   ,    None  ,    None   ,   None  )


def squizzConvert( fileName , outCode , inCode = None , outFileName = None ):
    """
    Convert a sequence file with squizz. squizz is first called with the -n
    option; if this squizz version rejects it, squizz is called again without
    it and the number of sequences is reported as 0.

    @param fileName: the name of the file containing the sequence to convert
    @type fileName: string
    @param outCode: the format number in wich we want to convert the sequence
    @type outCode: int
    @param inCode: the format number detected by squizzDetect. If it is given
      and known by squizz, the format is forced with the -f option, otherwise
      squizz detects the format itself.
    @type inCode: int
    @param outFileName: the name of the file where the converter must write the
     sequence converted. if it None a name will be generate from the filename
     with changing the extension. the extensions used are defined in  code2suffixe
    @type outFileName: string
    @return: a tuple of 4 elements
      - 'squizz'
      - the input format reported by squizz
      - the output format
      - the number of sequences ( 0 if squizz cannot count them )
     or ('squizz', None, None, None) if squizz reports an UNKNOWN input format.
    @rtype: ( string , string , string , int )
    @raise UnsupportedFormatError: if the inCode or the outCode are not supported
    @raise MobyleError: if something goes wrong during squizz convertion
      ( permission denied to write a file, to read the sequence file ...)
    """
    squizz_path = __cfg.seqconverter( 'SQUIZZ' )
    try:
        outFormat = code2squizz[ outCode ]
    except KeyError:
        raise UnsupportedFormatError( "squizz does not support the output format code " + str( outCode ) )

    if outFileName is None:
        outFileName = os.path.splitext( fileName )[0] + code2suffixe[ outCode ]

    options = [ "-c" , outFormat ]
    # inCode is a number, thus the squizz name must be searched in code2squizz.
    # ( it was searched in squizz2code, which is indexed by names, so the
    #   -f option was never used )
    inFormat = code2squizz.get( inCode )
    if inFormat is not None:
        options += [ "-f" , inFormat ]

    def run( count_option ):
        # run squizz with its stdout in a fresh outFileName,
        # give back the exit status and what squizz wrote on stderr
        cmde = [ squizz_path , "-S" ] + count_option + options + [ fileName ]
        try:
            outFile = open( outFileName , 'w' )
        except IOError as err:
            s_log.error( "can't write outFile:" + str( err ) )
            raise MobyleError( "Sequence Convertion Error: "+ str( err ) )
        try:
            try:
                squizz_pipe = Popen( cmde ,
                                     shell  = False ,
                                     stdout = outFile ,
                                     stdin  = None ,
                                     stderr = PIPE
                                     )
            except OSError as err:
                raise MobyleError( str( err ) )
            # communicate() reads stderr while waiting for the process, a
            # plain wait() may block for ever when the pipe buffer is full.
            report = squizz_pipe.communicate()[1]
        finally:
            outFile.close()
        return squizz_pipe.returncode , report

    squizz_nb = True
    returncode , report = run( [ "-n" ] )
    if returncode != 0 and re.search( "squizz: invalid option -- n" , report ):
        # this squizz version cannot count the entries
        squizz_nb = False
        returncode , report = run( [] )

    count_pattern = r"(: \w+)?: (.+) format, (\d+) entries\.$"
    plain_pattern = r"(: \w+)?: (.+) format\.$"

    if returncode != 0:
        # NOTE(review): "unsupported" is the wording the previous
        # ( unreachable ) check expected from squizz when the format given to
        # -f is refused - to confirm against the squizz output.
        match = re.search( count_pattern , report ) or re.search( plain_pattern , report )
        if match and match.group(2) == "unsupported":
            raise UnsupportedFormatError( report )
        raise MobyleError( "squizz exit abnormaly: " + report )

    if squizz_nb:
        match = re.search( count_pattern , report )
    else:
        match = re.search( plain_pattern , report )
    if not match:
        raise MobyleError( str( report ) )

    detectFormat = match.group(2)
    if detectFormat == "UNKNOWN":
        # the inFormat is not recognize
        return ( 'squizz' , None , None , None )

    if squizz_nb:
        seq_nb = int( match.group(3) )
    else:
        seq_nb = 0
    return ( 'squizz' , detectFormat , outFormat , seq_nb )
            






def readseqConvert( fileName , outCode , inCode = None , outFileName = None ):
    """
    Convert a sequence file with readseq.

    @param fileName: the name of the file containing the sequence to convert
    @type fileName: string
    @param outCode: the format number in wich we want to convert the sequence
    @type outCode: int
    @param inCode: the format number detected previously. It is not used,
      readseq detects the input format by itself.
    @type inCode: int
    @param outFileName: the name of the file where the converter must write the
     sequence converted. if it None a name will be generate from the filename
     with changing the extension. the extensions used are defined in  code2suffixe
    @type outFileName: string
    @return: a tuple of 4 elements
      - 'readseq'
      - the input format
      - the output format
      - the number of sequences
     or ('readseq', None, None, None) if readseq does not recognize the input
     format ( the output file is then removed ).
    @rtype: ( string , string , string , int )
    @raise UnsupportedFormatError: if the output format is not supported
    @raise MobyleError: if readseq cannot be executed or exits with a non zero status
    """
    readseq_path = __cfg.seqconverter( 'READSEQ' )
    try:
        outFormat = myCode2Format[ outCode ]
    except KeyError:
        raise UnsupportedFormatError( "readseq: unknown output format code " + str( outCode ) )

    if outFileName is None:
        # the suffixes of code2suffixe already start with a dot
        outFileName = os.path.splitext( fileName )[0] + code2suffixe[ outCode ]

    # if the result sequence file is not specified with the
    # -o readseq option. readseq write the output sequence
    # in file named infile + suffixe_correspondint to the -f option
    # but doesn't used the stdout
    cmde = [ readseq_path , "-a" , "-v" , "-f" , str( outCode ) , "-o" , outFileName , fileName ]
    try:
        readseq_pipe = Popen( cmde ,
                              shell  = False ,
                              stdout = None,
                              stdin  = None ,
                              stderr = PIPE
                              )
    except OSError as err:
        raise MobyleError( str( err ) )

    # communicate() reads stderr while waiting for the process, a plain
    # wait() may block for ever when the pipe buffer is full.
    report = readseq_pipe.communicate()[1]
    if readseq_pipe.returncode != 0:
        raise MobyleError( "readseq exit abnormaly: " + report )

    format_pattern = re.compile( r"^Sequence.*format=\s+(\d+)\.\s+(\S+)," )
    seq_format = None
    format_nb = None
    seq_nb = 0
    # One matching line is counted as one sequence. The format of the last
    # matching line is kept, whatever readseq prints after it.
    for line in report.splitlines():
        match_fmt = format_pattern.search( line )
        if match_fmt :
            format_nb = int( match_fmt.group(1) )
            seq_format = match_fmt.group(2)
            seq_nb = seq_nb + 1
        elif line.find( "No BioseqWriter for this format" ) != -1:
            #the format provide to -f option is not supported
            raise UnsupportedFormatError( line )

    if seq_format is not None and 'unknown' not in seq_format and 'Plain' not in seq_format:
        # the name given by readseq is used for a code without Mobyle name
        return ( 'readseq' , myCode2Format.get( format_nb , seq_format ) , outFormat  , seq_nb )

    # if readseq don't know the format (ex gde) it recognise it as Plain/raw
    # do a convertion but the final result is False !
    # thus the outFile must be erased
    try:
        os.unlink( outFileName )
    except OSError:
        # os.unlink raises OSError ( not IOError ); nothing to erase is fine
        pass
    return ( 'readseq' , None , None , None )

        






    



