// Patterns used to decide whether a user-supplied string is a valid
// sequence identifier or a valid amino acid sequence. All patterns are
// anchored and case-sensitive (upper case only).

// UniProtKB accession number (6 or 10 characters)
// source: https://www.uniprot.org/help/accession_numbers
const UNIPROT_ACCESSION_REGEXP =
  /^[OPQ][0-9][A-Z0-9]{3}[0-9]$|^[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$/;

// UniProtKB entry name of the form X_Y
// source: https://www.uniprot.org/help/entry_name
const UNIPROT_ENTRY_NAME_REGEXP = /^[A-Z0-9]{1,10}_[A-Z0-9]{1,5}$/;

// amino acid sequence made up of the 20 standard amino acids only
export const AMINO_ACID_SEQ_REGEXP = /^[ACDEFGHIKLMNPQRSTVWY]+$/;

// amino acid sequence that may also contain the ambiguous / non-standard
// symbols B, X, Z, U and O
export const EXTENDED_AMINO_ACID_SEQ_REGEXP = /^[ACDEFGHIKLMNPQRSTVWYBXZUO]+$/;

// Pieces of the sequence download URL: base + identifier + extension
// (this base URL replaces the legacy https://www.uniprot.org/uniprot/ one)
const SEQUENCE_DOWNLOAD_BASE_URL = "https://rest.uniprot.org/uniprotkb/";
const SEQUENCE_DOWNLOAD_EXTENSION = ".fasta";

/*
  Classify the raw text of the sequence input box.

  The input is trimmed and upper-cased as a whole, then interpreted as:
    - nothing (empty input, also null/undefined),
    - a single line holding a UniProt accession, a UniProt entry name or
      a raw amino acid sequence, or
    - several lines holding a FASTA entry (optional ">" header line) or a
      raw amino acid sequence wrapped over multiple lines.

  Returns an object with the following fields:
    empty          - true if the input contains no text (counts as valid)
    valid          - true if the input is empty, a recognised identifier,
                     or a sequence of standard amino acids only
    nonStandard    - true if the sequence contains symbols that are only
                     part of the extended alphabet (valid is false then)
    identifier     - upper-cased identifier, or null
    identifierType - "accession", "name", or null
    sequence       - upper-cased sequence, or null. For single-line input
                     a non-standard sequence is still stored; for
                     multi-line input the sequence is only stored if valid
    header         - upper-cased FASTA header line including ">", or null
*/
export function parseSequenceBoxInput(input) {
  const result = {
    empty: false,
    valid: false,
    nonStandard: false,
    identifier: null,
    identifierType: null,
    sequence: null,
    header: null
  };

  // normalise once: missing input counts as empty text; trimming and
  // upper-casing the whole text here means individual lines never need
  // to be upper-cased again
  const text = (input == null ? "" : input).trim().toUpperCase();

  if (text.length === 0) {
    result.empty = true;
    result.valid = true;
    return result;
  }

  // split on newline (Unix/Windows); always yields at least one line
  const lines = text.split(/\r?\n/);

  if (lines.length === 1) {
    // a single line has to be an identifier or a raw amino acid sequence
    const line = lines[0];

    if (UNIPROT_ACCESSION_REGEXP.test(line)) {
      result.valid = true;
      result.identifier = line;
      result.identifierType = "accession";
    } else if (UNIPROT_ENTRY_NAME_REGEXP.test(line)) {
      result.valid = true;
      result.identifier = line;
      result.identifierType = "name";
    } else if (EXTENDED_AMINO_ACID_SEQ_REGEXP.test(line)) {
      // store the sequence even if it turns out to be non-standard
      result.sequence = line;
      result.valid = AMINO_ACID_SEQ_REGEXP.test(line);
      result.nonStandard = !result.valid;
    }
    // anything else is not a valid input: result stays invalid

    return result;
  }

  // multiple lines: has to be FASTA or a raw sequence spread over lines
  let firstSeqLine = 0;

  if (lines[0].startsWith(">")) {
    result.header = lines[0];
    firstSeqLine = 1;
  }

  let aggregatedSequence = "";

  for (const rawLine of lines.slice(firstSeqLine)) {
    const curLine = rawLine.trim();

    // skip blank lines and ";" comment lines
    if (curLine.length === 0 || curLine.startsWith(";")) {
      continue;
    }

    if (!EXTENDED_AMINO_ACID_SEQ_REGEXP.test(curLine)) {
      // character outside of any amino acid alphabet: invalid input
      return result;
    }

    if (!AMINO_ACID_SEQ_REGEXP.test(curLine)) {
      // only extended-alphabet symbols are at fault: flag as non-standard
      result.nonStandard = true;
      return result;
    }

    aggregatedSequence += curLine;
  }

  // input consisting only of a header, comments and blank lines carries
  // no sequence and must not be reported as valid
  if (aggregatedSequence.length > 0) {
    result.valid = true;
    result.sequence = aggregatedSequence;
  }

  return result;
}

/*
  Build the URL from which the FASTA file for the given identifier can
  be downloaded (base URL + identifier + file extension)
*/
export function getUniprotURL(identifier) {
  return `${SEQUENCE_DOWNLOAD_BASE_URL}${identifier}${SEQUENCE_DOWNLOAD_EXTENSION}`;
}

/*
  Classify a protein sequence against the two amino acid alphabets.
  `valid` is true only if every symbol is one of the 20 standard amino
  acids; `nonStandard` is true if the sequence fails that check but
  still fits the extended alphabet. A sequence containing any other
  character (or no characters at all) yields false for both.
  The sequence is tested as given, without case normalisation.
*/
export function verifyValidProteinSequence(sequence) {
  const isStandard = AMINO_ACID_SEQ_REGEXP.test(sequence);
  const isExtended = EXTENDED_AMINO_ACID_SEQ_REGEXP.test(sequence);

  return {
    valid: isStandard,
    nonStandard: !isStandard && isExtended
  };
}

/*
  Split a single FASTA entry (given as one string) into its header line
  and its sequence.

  The entry is trimmed and split on Unix/Windows newlines. If the first
  line starts with ">", it is returned unchanged as `header`; otherwise
  `header` is null and the first line counts as sequence. All remaining
  lines are trimmed, upper-cased and joined into `sequence`, which is
  null if nothing is left.
*/
export function parseFasta(entry) {
  const lines = entry.trim().split(/\r?\n/);

  const hasHeader = lines[0].startsWith(">");
  const header = hasHeader ? lines[0] : null;

  // everything after the (optional) header belongs to the sequence
  const sequence = lines
    .slice(hasHeader ? 1 : 0)
    .map((line) => line.trim().toUpperCase())
    .join("");

  return {
    header,
    sequence: sequence || null
  };
}

/*
  Return the identifier part of a FASTA header line, i.e. the first
  whitespace-delimited token. A ">" at the very start of the line is
  dropped first, and whitespace around the remaining text is ignored,
  so that "> id description" still yields "id". Returns an empty string
  if no token is left.
*/
export function parseFastaHeader(headerLine) {
  // drop the ">" marker if the line starts with one
  const withoutMarker = headerLine.startsWith(">")
    ? headerLine.slice(1)
    : headerLine;

  // first token of the trimmed remainder is the identifier
  const [identifier] = withoutMarker.trim().split(/\s+/);

  return identifier;
}

/*
  Build the AlphaFold download URL of the PDB model file (fragment F1,
  model version 4) for the given UniProt accession
*/
export function getAlphaFoldUrl(uniprotAc) {
  const fileName = `AF-${uniprotAc}-F1-model_v4.pdb`;

  return `https://alphafold.ebi.ac.uk/files/${fileName}`;
}