# @file analyze_filter.py
#
# Filters results in a SARIF file.
#
#            Apache License
#      Version 2.0, January 2004
#   http://www.apache.org/licenses/
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file has been altered from its original form. Based on code in:
#   https://github.com/advanced-security/filter-sarif
#
# It primarily contains modifications made to integrate with the CodeQL plugin.
#
# Specifically:
#   https://github.com/advanced-security/filter-sarif/blob/main/filter_sarif.py
#
# View the full and complete license as provided by that repository here:
#   https://github.com/advanced-security/filter-sarif/blob/main/LICENSE
#
# SPDX-License-Identifier: Apache-2.0
##

import json
import logging
import re
from os import PathLike
from typing import Iterable, List, Tuple

from analyze.globber import match


def _match_path_and_rule(
    path: str, rule: str, patterns: Iterable[str]) -> bool:
    """Returns whether a given path matches a given rule.

    Args:
        path (str): A file path string.
        rule (str): A rule file path string.
        patterns (Iterable[str]): An iterable of pattern strings.

    Returns:
        bool: True if the path matches a rule. Otherwise, False.
    """
    result = True
    for s, fp, rp in patterns:
        if match(rp, rule) and match(fp, path):
            result = s
    return result


def _parse_pattern(line: str) -> Tuple[str]:
    """Parses a given pattern line.

    Args:
        line (str): The line string that contains the rule.

    Returns:
        Tuple[str]: The parsed sign, file pattern, and rule pattern from the
                    line.
    """
    sep_char = ':'
    esc_char = '\\'
    file_pattern = ''
    rule_pattern = ''
    seen_separator = False
    sign = True

    # inclusion or exclusion pattern?
    u_line = line
    if line:
        if line[0] == '-':
            sign = False
            u_line = line[1:]
        elif line[0] == '+':
            u_line = line[1:]

    i = 0
    while i < len(u_line):
        c = u_line[i]
        i = i + 1
        if c == sep_char:
            if seen_separator:
                raise Exception(
                    'Invalid pattern: "' + line + '" Contains more than one '
                    'separator!')
            seen_separator = True
            continue
        elif c == esc_char:
            next_c = u_line[i] if (i < len(u_line)) else None
            if next_c in ['+' , '-', esc_char, sep_char]:
                i = i + 1
                c = next_c
        if seen_separator:
            rule_pattern = rule_pattern + c
        else:
            file_pattern = file_pattern + c

    if not rule_pattern:
        rule_pattern = '**'

    return sign, file_pattern, rule_pattern


def filter_sarif(input_sarif: PathLike,
                 output_sarif: PathLike,
                 patterns: List[str],
                 split_lines: bool) -> None:
    """Filters a SARIF file with a given set of filter patterns.

    Args:
        input_sarif (PathLike): Input SARIF file path.
        output_sarif (PathLike): Output SARIF file path.
        patterns (PathLike): List of filter pattern strings.
        split_lines (PathLike): Whether to split lines in individual patterns.
    """
    if split_lines:
        tmp = []
        for p in patterns:
            tmp = tmp + re.split('\r?\n', p)
        patterns = tmp

    patterns = [_parse_pattern(p) for p in patterns if p]

    logging.debug('Given patterns:')
    for s, fp, rp in patterns:
        logging.debug(
            'files: {file_pattern}    rules: {rule_pattern} ({sign})'.format(
                file_pattern=fp,
                rule_pattern=rp,
                sign='positive' if s else 'negative'))

    with open(input_sarif, 'r') as f:
        s = json.load(f)

    for run in s.get('runs', []):
        if run.get('results', []):
            new_results = []
            for r in run['results']:
                if r.get('locations', []):
                    new_locations = []
                    for l in r['locations']:
                        # TODO: The uri field is optional. We might have to
                        #       fetch the actual uri from "artifacts" via
                        #       "index"
                        # (see https://github.com/microsoft/sarif-tutorials/blob/main/docs/2-Basics.md#-linking-results-to-artifacts)
                        uri = l.get(
                                    'physicalLocation', {}).get(
                                        'artifactLocation', {}).get(
                                            'uri', None)

                        # TODO: The ruleId field is optional and potentially
                        #       ambiguous. We might have to fetch the actual
                        #       ruleId from the rule metadata via the ruleIndex
                        #       field.
                        # (see https://github.com/microsoft/sarif-tutorials/blob/main/docs/2-Basics.md#rule-metadata)
                        ruleId = r['ruleId']

                        if (uri is None or
                            _match_path_and_rule(uri, ruleId, patterns)):
                            new_locations.append(l)
                    r['locations'] = new_locations
                    if new_locations:
                        new_results.append(r)
                else:
                    # locations array doesn't exist or is empty, so we can't
                    # match on anything. Therefore, we include the result in
                    # the output.
                    new_results.append(r)
            run['results'] = new_results

    with open(output_sarif, 'w') as f:
        json.dump(s, f, indent=2)