
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <unistd.h>
#include <ctype.h>

#ifdef STDC_HEADERS
#include <stdlib.h>
#include <string.h>
#endif

#include <sys/types.h>
#include <regex.h>

#include "mytag.h"
#include "error.h"
#include "getpara.h"
#include "fasta.h"
#include "pat.h"

#include "prosite.h"

#ifndef PROSITEDATA
#define PROSITEDATA "/usr/local/share/"PACKAGE
#endif

#ifndef REG_BASIC
#define REG_BASIC 0
#endif


static void usage(void);
static int scanpat(fasta_t *, para_t *);
static int processpat(pat_t *, fasta_t *, para_t *);
static void print_match(int, int, pat_t *, fasta_t *, int);


/*
 * Program entry point.
 *
 * Builds the default pattern database path ("<dir>/prosite.dat", where <dir>
 * is the PROSITEDATA environment variable or, failing that, the compiled-in
 * PROSITEDATA directory), parses the command-line options, then scans every
 * sequence of every file named on the command line with scanpat().
 *
 * Returns EXIT_SUCCESS after all files were processed or after -h, and
 * EXIT_FAILURE on an unknown option or when no sequence file is given.
 */
int main(int argc, char **argv) {
  para_t pa;
  int i, status;
  FILE *f;
  char *db, *sfile, *datadir;
  fasta_t s;
  size_t len;

  status = EXIT_SUCCESS;

  if ((datadir = getenv("PROSITEDATA")) == NULL) {
    datadir = PROSITEDATA; }
  /* sizeof of the literal accounts for the suffix and its terminating NUL */
  len = strlen(datadir) + sizeof("/prosite.dat");
  if ((db = (char *)malloc(len)) == NULL) {
    error_fatal("memory", NULL); }
  (void)sprintf(db, "%s/prosite.dat", datadir);

  /* default values */
  pa.flags = PARAMS_DEFAULT;
  pa.pformat = PROSITE;
  pa.pattern = db;
  pa.match = SHORT;

  /* command line parsing */
  while ((i = getopt(argc, argv, "cdhl:m:p:s")) != -1) {
    switch (i) {
    case 'c':
      pa.flags |= PARAMS_CASESENS;
      break;
    case 'd':
      pa.flags |= PARAMS_SHOWDOC;
      break;
    case 'h':
      usage();
      goto done;
    case 'l':
      pa.pformat = LIST;
      pa.pattern = optarg;
      break;
    case 'm':
      /* Only the first letter is significant: l = LONG, s = SHORT, anything
         else = ALL.  The cast keeps tolower() defined for negative chars. */
      switch (tolower((unsigned char)*optarg)) {
      case 'l':
        pa.match = LONG;
        break;
      case 's':
        pa.match = SHORT;
        break;
      default:
        pa.match = ALL; }
      break;
    case 'p':
      pa.pformat = PATTERN;
      pa.pattern = optarg;
      break;
    case 's':
      pa.flags |= PARAMS_ABUNDANT;
      break;
    default:
      usage();
      status = EXIT_FAILURE;
      goto done; }
  }
  if (optind == argc) {
    (void)fprintf(stderr, "%s: missing sequence file\n", argv[0]);
    usage();
    status = EXIT_FAILURE;
    goto done; }

  /* every remaining argument is a sequence file */
  for (i = optind; i < argc; i++) {
    sfile = argv[i];

    if ((f = fopen(sfile, "r")) == NULL) {
      error_fatal(sfile, NULL); }

    /* sequence treatment: the loop stops on the first non-positive return
       of fasta_parse() */
    while (fasta_parse(&s, f) > 0) {
      (void)scanpat(&s, &pa);
      fasta_free(&s);
    }

    if (fclose(f) != 0) {
      error_fatal(sfile, NULL); }
  }

 done:
  /* single exit point so the database path is released on every path */
  free(db);

  return status;
}

/*
 * Run every pattern selected by `pa' against the sequence `s'.
 *
 * The pattern source depends on pa->pformat:
 *   PATTERN  - pa->pattern is itself the pattern;
 *   LIST     - pa->pattern names a file read with getplist();
 *   otherwise (PROSITE) - pa->pattern names a file read with prosite_parse().
 *
 * Always returns OK; the results of processpat() are discarded.
 */
static int scanpat(fasta_t *s, para_t *pa) {
  pat_t cur;
  FILE *in;
  prosite_t *entry;

  /* A literal pattern needs no input file. */
  if (pa->pformat == PATTERN) {
    pat_init(&cur);
    cur.pat = pa->pattern;
    (void)processpat(&cur, s, pa);
    return OK;
  }

  /* Both remaining formats read their patterns from the file pa->pattern. */
  in = fopen(pa->pattern, "r");
  if (in == NULL) {
    error_fatal(pa->pattern, NULL);
  }

  if (pa->pformat == LIST) {
    /* The pattern string filled in by getplist() is released after use. */
    while (getplist(&cur, in) == OK) {
      (void)processpat(&cur, s, pa);
      free(cur.pat);
    }
  }
  else {
    /* PROSITE database: one entry per prosite_parse() call. */
    while ((entry = prosite_parse(in)) != NULL) {
      if (entry->pat != NULL) {
        cur.id = entry->nam;
        cur.ac = entry->acc;
        cur.acdoc = entry->doc;
        cur.desc = entry->dsc;
        cur.pat = entry->pat;

        /* Entries flagged abundant are scanned only when asked for. */
        if ((pa->flags & PARAMS_ABUNDANT) ||
            !(entry->flags & PROSITE_ABUNDANT)) {
          (void)processpat(&cur, s, pa);
        }
      }
      prosite_free(entry);
    }
  }

  if (fclose(in) != 0) {
    error_fatal(pa->pattern, NULL);
  }

  return OK;
}


/*
 * Search sequence `s' for pattern `p' and print the matches selected by
 * pa->match.
 *
 * The pattern is converted with prosite_pat2bre() and compiled with
 * regcomp(), case-insensitively unless PARAMS_CASESENS is set in pa->flags.
 * The sequence is then scanned repeatedly, each scan restarting one
 * character after the start of the previous match:
 *   LONG  - print each match as returned by regexec(), skipping a match that
 *           ends where the previous one ended;
 *   ALL   - print the match and every shorter match found while trimming its
 *           right end;
 *   SHORT - trim the right end, then the left end, and print what remains.
 *
 * Returns the number of matches counted (sub-matches found while trimming
 * included), or 0 when the sequence is absent or the pattern does not
 * compile (a diagnostic is written to stderr in the latter case).
 */
static int processpat(pat_t *p, fasta_t *s, para_t *pa) {
  regex_t re;
  regmatch_t match;
  int start, end, oend;
  int hasmatch, err;
  char *seq, *seqi, *buf, *bufi;
  char *regpat;
  int reflags, eolflag;
  long val;
  size_t bufsize;

  hasmatch = 0;
  if (s->seq == NULL) { return hasmatch; }

  /* compile pattern */
  /* NOTE(review): ownership of the string returned by prosite_pat2bre() is
     not visible from here; if it is heap-allocated it is never released --
     confirm against its definition. */
  regpat = prosite_pat2bre(p->pat);

  reflags = REG_BASIC;
  if ((pa->flags & PARAMS_CASESENS) == 0) { reflags |= REG_ICASE; }
  err = regcomp(&re, regpat, reflags);
  switch (err) {
  case 0: break; /* No error */
  case REG_BADBR:
    val = sysconf(_SC_RE_DUP_MAX);
    (void)fprintf(stderr, "%s: invalid pattern repetition (must be in [0,%ld] "
                  "range), skipped.\n", p->pat, val);
    return 0;
  default:
    (void)fprintf(stderr, "%s: invalid pattern, skipped\n", p->pat);
    return 0; }

  /* search pattern */
  seqi = seq = s->seq;

  /* allocate submatch buffer (large enough for the whole sequence) */
  bufsize = strlen(seq) + 1;
  if ((buf = (char *) malloc(bufsize)) == NULL) {
    error_fatal("memory", NULL); }

  /* -1 cannot be a real end offset, so the first LONG match is always
     printed, even a one-character match at offset 0. */
  oend = end = -1;

  /* Loop only on success (0): any other non-zero return, not just
     REG_NOMATCH, leaves `match' unset. */
  while (regexec(&re, seqi, 1, &match, 0) == 0) {

    /* Reject nul match */
    if (match.rm_so == match.rm_eo) {
      break; }

    hasmatch++;
    oend = end;
    start = (int)(seqi - seq) + (int)match.rm_so;
    end = (int)(seqi - seq) + (int)match.rm_eo - 1;

    if (pa->match == LONG) {
      if (oend != end) { print_match(start, end, p, s, pa->flags); }
    }
    else {
      if (pa->match == ALL) { print_match(start, end, p, s, pa->flags); }

      /* copy the match minus its last character */
      (void) memcpy(buf, seq + start, (size_t)(end - start));
      buf[end - start] = '\0';

      /* lstrip: look for ever shorter matches in the truncated copy.  The
         copy never reaches the real end of the sequence, so an end-of-line
         anchor must not match at its artificial end. */
      while (regexec(&re, buf, 1, &match, REG_NOTEOL) == 0) {
        /* Reject nul match */
        if (match.rm_so == match.rm_eo) {
          break; }
        hasmatch++;
        end = start + (int)match.rm_eo - 1;
        if (pa->match == ALL) { print_match(start, end, p, s, pa->flags); }
        buf[match.rm_eo - 1] = '\0';
      }

      /* rstrip */
      if (pa->match == SHORT) {
        /* restore the last character of the shortest match found so far */
        buf[end - start] = seq[end];
        buf[end - start + 1] = '\0';

        /* The copy ends at the real end of the sequence only when nothing
           follows seq[end]; bufi never points at the real beginning. */
        eolflag = (seq[end + 1] == '\0') ? 0 : REG_NOTEOL;
        bufi = buf + 1;
        while (regexec(&re, bufi, 1, &match, REG_NOTBOL | eolflag) == 0) {
          /* Reject nul match */
          if (match.rm_so == match.rm_eo) {
            break; }
          hasmatch++;
          start = start + (int)match.rm_so + 1;
          bufi = bufi + match.rm_so + 1;
        }
        print_match(start, end, p, s, pa->flags);
      }
    }

    /* a pattern anchored at the beginning can match only once */
    if (*regpat == '^') { break; }
    seqi = seq + start + 1;
  }

  regfree(&re);
  free(buf);

  return hasmatch;
}

/*
 * Print one match on stdout as
 *   <sequence name> <start> - <end> <pattern label> <matched residues>
 * with 1-based, inclusive positions (`start' and `end' are 0-based).
 *
 * With PARAMS_SHOWDOC set in `flags' the label is "<id> <acdoc>" of the
 * pattern; otherwise it is the pattern's `ac', or the pattern text itself
 * when `ac' is NULL.  A NULL sequence name, id or acdoc prints as "unknown".
 */
static void print_match(int start, int end, pat_t *p, fasta_t *s, int flags) {
  const char *seqname;

  seqname = s->nam;
  if (seqname == NULL) {
    seqname = "unknown";
  }
  (void)printf("%s %d - %d", seqname, start + 1, end + 1);

  if (flags & PARAMS_SHOWDOC) {
    const char *id = (p->id != NULL) ? p->id : "unknown";
    const char *doc = (p->acdoc != NULL) ? p->acdoc : "unknown";

    (void)printf(" %s %s", id, doc);
  }
  else {
    const char *label = (p->ac != NULL) ? p->ac : p->pat;

    (void)printf(" %s", label);
  }

  /* the matched stretch is end - start + 1 residues long */
  (void)printf(" %.*s\n", end - start + 1, s->seq + start);
}


/*
 * Print the command synopsis and the list of options on stdout.
 */
static void usage(void) {
  static const char *const help[] = {
    "\noptions:\n",
    "  -c       ... Do case-sensitive search.\n",
    "  -d       ... Show pattern documentation id.\n",
    "  -h       ... Print this message and exit.\n",
    "  -l <lst> ... Use patterns from file <lst>.\n",
    "  -m <fmt> ... Use report format <fmt>.\n",
    "  -p <pat> ... Search for pattern <pat>.\n",
    "  -s       ... Do not skip PROSITE abundant patterns.\n"
  };
  size_t k;

  (void)fprintf(stdout, "usage: %s [options] <file> ...\n", PACKAGE);

  /* the remaining lines are fixed text, written verbatim */
  for (k = 0; k < sizeof help / sizeof help[0]; k++) {
    (void)fputs(help[k], stdout);
  }
}
