/*************************************************************
Program for finding clusters 
*************************************************************/


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Capacity, in matrices, of thresholds[], matrix_names[], matrix_comment[]
   and of the second dimension of matrix[][][][]. */
#define MAX_MATR 200
/* Number of positions (rows) stored per matrix; also the length of the
   sequence window that make_score() evaluates. */
#define MATRIX_SIZE 20
/* Largest distance between two consecutive hits that analyze_the_results()
   still reports as belonging to the same cluster. */
#define MIN_LENGTH 300 
/* Value take_this_matrix() stores in the fifth column (index 4) of every
   matrix row; conversione() maps unrecognised characters to index 4. */
#define FOURTH_VALUE -99

/*************************************************************/
/* Variables */
/*************************************************************/

FILE *file_in;   /* stream opened by open_file() and read by readline() */
FILE *file_out;  /* not referenced in the visible source */
 
int  i,j,k,h; /* Counters */
int  matrix_counter;              /* index of the matrix being loaded or scored */
int  max_matrix;                  /* number of matrices, set by retreive_matrix_file_name() */
char command_arguments[5][100];   /* copies of argv[1..] made by main() */
char matrix_list_file_name[100];  /* argument 1 */
char sequence_file_name[100];     /* argument 2 */
char threshold_file_name[100];    /* argument 3; only copied in the visible source */
int thresholds[MAX_MATR];         /* per-matrix threshold read by take_this_matrix() */
char output_file_name[100];       /* argument 4; only copied in the visible source */
char matrix_names[MAX_MATR][100]; /* matrix file names, one per line of the list file */
char name[9][132];                /* not referenced in the visible source */
/* matrix[0] holds the values read by take_this_matrix(), matrix[1] the
   output of invert_matrix(); indexed [strand][matrix][position][base code] */
int  matrix[2][MAX_MATR][MATRIX_SIZE][5];
char *chr_ptr;                    /* result of the most recent fgets() call */
char data_line[1000];             /* last line read by readline() */
char names[5][1000];              /* names[0] and names[1] receive the two header lines */
int  window_n[1000];              /* numeric codes of the window currently being scored */
char carattere;                   /* scratch character used by conversione() */
int  seqn[1000];                  /* seq1 converted to numeric codes by conversione() */
int  end_of_file;                 /* set to 1 by readline() when fgets() returns NULL */
int  end_of_seq=0;                /* set to 1 by conversione() when it meets '*' */
char seq1[1000],seq2[1000];       /* current chunk being scanned / next line read */
int  l1,l2,ll;                    /* lengths of seq1, of seq2, and of seq1 as measured by conversione() */
int  colpito,trovati;             /* colpito is only reset in main(); trovati is set when a cluster is printed */
long int position;                /* window counter, incremented by make_score() */
long int last_position;           /* not referenced in the visible source */
int  base_comp[10];               /* counts of the first code of every scanned window */
int  debug;                       /* only used by commented-out debugging code */
int  seq_counter;                 /* incremented by main() once per processed sequence */
char bases[6]="ACGTN";            /* letter printed for each numeric code 0..4 */
char strands[3]="+-";             /* symbol printed for matrix[0] and matrix[1] hits */
int  score;                       /* score of the window/matrix/strand being evaluated */
char matrix_comment[MAX_MATR][1000]; /* first line of each matrix file without its first 6 characters */
/* formatted text of every hit; make_score() stops advancing hit_counter at 1000 */
char hit_set[100000][1000];
int  hit_counter;                 /* number of hits recorded for the current sequence */
int  long rec_position[100000];   /* value of position for each recorded hit */
int  cluster;                     /* 1 while analyze_the_results() is inside a cluster */
/*************************************************************/
/* functions declaration */
/*************************************************************/

/* Old-style (non-prototype) forward declarations of every function defined
   below, so that main() can call them before their definitions.
   The joining routine is defined and called as congiungi_n(); it is declared
   under that name here so the call in main() is not an implicit declaration. */
int open_file();
int retreive_matrix_file_name();
int retreive_matrices();
int take_this_matrix();
int invert_matrix();
int readline();
int take_slice();
int finestre_num();
int congiungi_n();
int get_header();
int conversione();
int get_info();
int make_score();
int analyze_the_results();
/*************************************************************/
/* M A I N */
/*************************************************************/

main (argc,argv)
int  argc;
char *argv[];
{

 /* Collect the command line arguments :
 1) File containing the matrix names to be used in the search. 
 2) File containing the sequence to be analyzed. 
 3) Threshold file.
 4) Output file name.
 */
  for (i=1;i<argc ; i++)
      {
       strcpy(command_arguments[i],argv[i]);
/*
       printf("%s\n" ,command_arguments[i]);
*/
      }
 /* retreivng names */
 strcpy(matrix_list_file_name,command_arguments[1]);
 strcpy(sequence_file_name,command_arguments[2]);
 strcpy(threshold_file_name,command_arguments[3]);
 strcpy(output_file_name,command_arguments[4]);
 
 printf("Looking for binding sites clustered in MIN_LENGTH bp\n"); 
/*
  printf("%s\t%s\t%s\t%s\n",	matrix_list_file_name,
				sequence_file_name,
				threshold_file_name,
				output_file_name); 
*/
 retreive_matrix_file_name(); 
 retreive_matrices();
 printf(" Matrices are OK .....now reading %s file\n",sequence_file_name);
 open_file(sequence_file_name);
 get_header();
 end_of_seq=0;
 while(1)
  {
   colpito=0;
   position=0;
   trovati=0;
   if (end_of_file) break;
   take_slice(seq1,&l1);
   while(1)
      {
       conversione();
/*
       printf("%s\n",seq1);
       for (debug=0;debug<ll;debug++)
           {
            printf("%d",seqn[debug]);
           }  
       printf("\n");
*/
       if (l1>MATRIX_SIZE-1) finestre_num();
       if (end_of_seq) break;
       take_slice(seq2,&l2);
       congiungi_n();
       }
    seq_counter++;
    end_of_seq=0;
    if(hit_counter>=2) analyze_the_results();
/*
    if(trovati==0) printf("Next item .....\n");
*/
    hit_counter=0;
    get_info(); 
    trovati=0;
    readline(); /* eliminate the first line */
    position=0;
  }
 return(0);
}



/*************************************************************/
/* functions */
/*************************************************************/


/*-------------------------------------------------------------------------*/
/*
 * Opens the file named by stringa for reading and stores the stream in the
 * global file_in.
 *
 * If the file cannot be opened, the reason and a "not found" message are
 * printed and the program terminates with a failure exit status.
 * Returns 0 when the file was opened.
 */
int open_file(char *stringa)
{
 file_in = fopen(stringa,"r");
 if (file_in == NULL)
    {
     /* perror() goes first: printf() is allowed to overwrite errno */
     perror("");
     printf("\nFile %s not found\n",stringa);
     exit(EXIT_FAILURE);
    }
 return(0);
}
/*-------------------------------------------------------------------------*/

retreive_matrix_file_name()
{
 int line_counter=0;
 open_file(matrix_list_file_name);
 chr_ptr=matrix_list_file_name;
 while(chr_ptr!=NULL)
      {
       chr_ptr=fgets(matrix_names[line_counter],
                     sizeof(matrix_names[line_counter]),
                     file_in);
       matrix_names[line_counter][strlen(matrix_names[line_counter])-1]='\0';
       line_counter++;
      }
 max_matrix=line_counter - 1;
 fclose(file_in);
 return(0);
}
/*-------------------------------------------------------------------------*/
/*
 * Loads one matrix file.
 *
 * file_name       name of the matrix file to read
 * matrix_ptr      destination: MATRIX_SIZE rows of 5 ints each
 * matrix_counter  index used for matrix_comment[] and thresholds[]
 *
 * File layout as read by this function:
 *   line 1   comment line; its first 6 characters are dropped and the rest
 *            is stored in matrix_comment[matrix_counter]
 *   line 2   a word followed by an integer, stored in
 *            thresholds[matrix_counter]
 *   line 3   skipped
 *   then     up to MATRIX_SIZE lines of five integers; the first one is
 *            discarded, the other four fill columns 0..3 of the row
 * Column 4 of every row is set to FOURTH_VALUE.  Rows for which the file
 * has no line keep zeros in columns 0..3.
 * Always returns 0 (open_file() terminates the program if the file cannot
 * be opened).
 */
int take_this_matrix(char *file_name, int *matrix_ptr, int matrix_counter)
{
  char tmp_string[1000];
  int position_counter;
  int column;
  int *row;

  open_file(file_name);

  /* retrieving the comment line */
  chr_ptr=fgets(tmp_string,sizeof(tmp_string),file_in);
  if (chr_ptr == NULL) tmp_string[0]='\0';

  /* stripping the line terminator and the leading 6 characters ('MATRIX') */
  tmp_string[strcspn(tmp_string,"\r\n")]='\0';
  if (strlen(tmp_string) > 6)
     strcpy(matrix_comment[matrix_counter],tmp_string+6);
  else
     matrix_comment[matrix_counter][0]='\0';

  /* retrieving the threshold; the leading word is parsed but not stored */
  chr_ptr=fgets(tmp_string,sizeof(tmp_string),file_in);
  if ((chr_ptr == NULL) ||
      (sscanf(tmp_string,"%*s %d",&thresholds[matrix_counter]) != 1))
     {
      fprintf(stderr,"Warning: no threshold found in matrix file %s\n",
              file_name);
     }

  /* skipping the next line */
  chr_ptr=fgets(tmp_string,sizeof(tmp_string),file_in);

  for (position_counter=0;
       position_counter<MATRIX_SIZE;
       position_counter++)
      {
       row=matrix_ptr+(position_counter*5);
       for (column=0;column<4;column++) row[column]=0;
       row[4]=FOURTH_VALUE;

       chr_ptr=fgets(tmp_string,sizeof(tmp_string),file_in);
       /* file shorter than MATRIX_SIZE rows: do not parse the previous
          line again, leave this row at zero */
       if (chr_ptr == NULL) continue;
       sscanf(tmp_string,"%*d\t%d\t%d\t%d\t%d",
              row+0,
              row+1,
              row+2,
              row+3);
      }
  fclose(file_in);
  return(0);
}
/*-------------------------------------------------------------------------*/
/*
 * Builds in output_ptr the matrix for the opposite strand of input_ptr.
 * Both arguments point to MATRIX_SIZE rows of 5 ints.
 *
 * Output row p is taken from input row MATRIX_SIZE-1-p.  Within a row,
 * columns 0..3 are mirrored (0<->3, 1<->2) and column 4 is copied to
 * column 4.  Column 4 must not be mirrored: doing so would read index -1,
 * i.e. the previous row, or memory before the matrix for input row 0.
 * Always returns 0.
 */
int invert_matrix(int *input_ptr, int *output_ptr)
{
 int base_counter,position_counter;
 int source_base;
 int *source_row;

 for (position_counter=0;position_counter<MATRIX_SIZE;position_counter++)
     {
      source_row=input_ptr+((MATRIX_SIZE-position_counter-1)*5);
      for (base_counter=0;base_counter<5;base_counter++)
          {
           source_base=(base_counter<4) ? (3-base_counter) : 4;
           output_ptr[(position_counter*5)+base_counter]=source_row[source_base];
          }
     }
 return(0);
}

/*-------------------------------------------------------------------------*/
retreive_matrices()
{
 int position_counter,base_counter;
 for (matrix_counter=0;matrix_counter<max_matrix;matrix_counter++)
     {
      take_this_matrix(matrix_names[matrix_counter],
                       &matrix[0]
                              [matrix_counter][0][0],
                              matrix_counter
                              );

      invert_matrix(&matrix[0][matrix_counter][0][0],
                    &matrix[1][matrix_counter][0][0]);
/* 
      printf("%s\nDIRECT:\n",matrix_names[matrix_counter]);
      for (base_counter=0;base_counter<5;base_counter++)
         {
          for(
              position_counter=0;
              position_counter<MATRIX_SIZE;
              position_counter++
             )
             {
              printf("%4d",matrix[0]
                                 [matrix_counter]
                                 [position_counter]
                                 [base_counter]);
             }
          printf("\n");
         }
      printf("%s\nINVERTED:\n",matrix_names[matrix_counter]);
      for (base_counter=0;base_counter<5;base_counter++)
         {
          for(
              position_counter=0;
              position_counter<MATRIX_SIZE;
              position_counter++
             )
             {
              printf("%4d",matrix[1]
                                 [matrix_counter]
                                 [position_counter]
                                 [base_counter]);
             }
           printf("\n");
          }   
     printf("%d\n\n\n",thresholds[matrix_counter]);
*/
     }
 return(0);
}        

/*-------------------------------------------------------------------------*/
/*
 * Reads the next line of file_in into data_line and keeps the fgets()
 * result in chr_ptr.  When nothing could be read, end_of_file is set to 1.
 * Always returns 0.
 */
int readline(void)
{
 if ((chr_ptr=fgets(data_line,sizeof data_line,file_in)) == NULL)
    {
     end_of_file=1;
    }
 return(0);
}
/*-------------------------------------------------------------------------*/
/*
 * Consumes the first two lines at the beginning of a sequence, keeping a
 * copy of them in names[0] and names[1].
 * Always returns 0.
 */
int get_header(void)
{
 int header_line;

 for (header_line=0;header_line<2;header_line++)
     {
      readline();
      strcpy(names[header_line],data_line);
     }
 return(0);
}
/*-------------------------------------------------------------------------*/
take_slice(seq,l_ptr)
char *seq;
int *l_ptr;
{
 int ll=0;
 int i,j;
 char cls_line[1000];
 readline();
 ll=strlen(data_line);
 j =0;
 for (i=0;i<ll-1;i++) /* ll-1 for avoiding \n at the end */
      {
      if ((data_line[i]!=' ')&&(data_line[i]!='\t'))
         {
          cls_line[j]=data_line[i];
          j++;
         }
     }
 cls_line[j]='\0';
 *l_ptr =strlen(cls_line);
 strcpy(seq,cls_line);
 return(0);
}
/*-------------------------------------------------------------------------*/
/*
 * Slides a window of MATRIX_SIZE codes over seqn[0..ll-1].  Each window is
 * copied into window_n and scored with make_score(); the first code of the
 * window is then counted in base_comp[].
 * Always returns 0.
 */
int finestre_num(void)
{
 int start;
 int offset;

 start=0;
 while (start < ll - MATRIX_SIZE + 1)
       {
        for (offset=0;offset<MATRIX_SIZE;offset++)
            {
             window_n[offset]=seqn[start+offset];
            }
        make_score();
        /* statistics of the base composition of the bank */
        base_comp[window_n[0]]++;
        start++;
       }
 return(0);
}
/*-------------------------------------------------------------------------*/
/*
 * Converts the string seq1 into numeric codes stored in seqn[]:
 *   A/a -> 0, C/c -> 1, G/g -> 2, T/t/U/u -> 3, anything else -> 4.
 * A '*' also sets end_of_seq to 1.  The length of seq1 is left in ll.
 * Always returns 0.
 */
int conversione(void)
{
 ll=strlen(seq1);
 i=0;
 while (i<ll)
       {
        carattere=seq1[i];
        if ((carattere=='A')||(carattere=='a'))
           {
            seqn[i]=0;
           }
        else if ((carattere=='C')||(carattere=='c'))
           {
            seqn[i]=1;
           }
        else if ((carattere=='G')||(carattere=='g'))
           {
            seqn[i]=2;
           }
        else if ((carattere=='T')||(carattere=='t')||
                 (carattere=='U')||(carattere=='u'))
           {
            seqn[i]=3;
           }
        else
           {
            if (carattere=='*') end_of_seq=1;
            seqn[i]=4;
           }
        i++;
       }
 return(0);
}
/*-------------------------------------------------------------------------*/

congiungi_n()
{
 char tempo_whole[1000],tempo_last[10000];
 if (l2>0)
  {
   /* copy of the last MATRIX_SIZE-1 characters of the seq1 string */
   for(i=0;i<MATRIX_SIZE - 1;i++)
    {
     tempo_last[i]=*(seq1 + strlen(seq1) - (MATRIX_SIZE - 1)   + i );
    }
    tempo_last[MATRIX_SIZE-1]='\0';
    strcpy(tempo_whole,tempo_last); /* adding the MATRIX_SIZE-1 characters*/
    strcat(tempo_whole,seq2);       /* adding the next line of chars */
    strcpy(seq1,tempo_whole);       /* converting seq2--->seq1       */
    l1=strlen(seq1);
  }
 return(0);
}
/*-------------------------------------------------------------------------*/
/*
 * Reads lines until one starting with '>' is in data_line or the input
 * ends.  When trovati is set, every line read that does not start with '>'
 * is printed, followed by a closing separator line.
 *
 * At end of file data_line is not refreshed by readline(), so the loop
 * stops without printing the previous line again.
 * Always returns 0.
 */
int get_info(void)
{
 while ((data_line[0]!='>')&&(!end_of_file))
       {
        readline();
        if (end_of_file) break;
        if ((trovati)&&(data_line[0]!='>')) printf("%s",data_line);
       }
 if (trovati) printf("---------------------------------------------------\n");
 return(0);
}
/*-------------------------------------------------------------------------*/
make_score()
{
 int delta=4;
 int min_length=200;
 char jolly[1000];
 int i;
 int strand;
 position++;
 for (matrix_counter=0;
      matrix_counter<max_matrix;
      matrix_counter++)
     {
      for (strand=0;strand<2;strand++)
          {
           score=0;
           for (i=0;i<MATRIX_SIZE;i++)
               {
                score=score+ matrix[strand]
                                   [matrix_counter]
                                   [i]
                                   [window_n[i]]; 
               } 
           if (score>thresholds[matrix_counter])
              {
               rec_position[hit_counter]=position;
               sprintf(hit_set[hit_counter],"%6ld nt.  score:%4d   ",
                       position,
                       score);

               for (i=0;i<MATRIX_SIZE;i++)
                   {
                    jolly[i]=bases[window_n[i]];
                   }
               jolly[i]='\0';
               strcat(hit_set[hit_counter],jolly);
               sprintf(jolly,"%c\t%s\n",strands[strand],matrix_comment[matrix_counter]);
               strcat(hit_set[hit_counter],jolly);
/*
              printf("%d\t%s",hit_counter,hit_set[hit_counter]);
*/

               if (hit_counter<1000) hit_counter++;
              }
          }
      }
 return(0);
}  
/*-------------------------------------------------------------------------*/
analyze_the_results()
{
 int distance[1000];
 int site_counter=0;
 for (i=0;i<hit_counter-1;i++)
     {
      /*calculating the distances between contigous sites */
      distance[i]=rec_position[i+1]-rec_position[i];
/*
      printf("+++%d\t%s",distance[i],hit_set[i]);
*/
     }
 cluster=0;
 for (i=0;i<hit_counter-1;i++)
     {
      if ((distance[i]<=MIN_LENGTH)&&(distance[i]>5))
         {    
          printf("%s",hit_set[i]);
          cluster=1;
          trovati=1;
          site_counter++;
         }
      else
         {
          if ((cluster)&&(distance[i]>5))
             {
              printf("%s",hit_set[i]);
              site_counter ++;
              printf("+-+-+-+-+-+-+-+-+--+-+-+\n");
              cluster=0;
             }
         }
     }
 if (cluster)
   { 
    printf("%s",hit_set[hit_counter-1]);
    site_counter++;
   }
 if (trovati) printf("%d+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n",site_counter);
    
 return(0);
}

