branch: main
diff.rs
18100 bytesRaw
//! Diff engine: tree comparison and line-level file diffs.
//!
//! Provides recursive tree diffing (which files changed between two commits)
//! and line-level unified diffs via the `similar` crate.

use crate::store;
use serde::Serialize;
use std::collections::HashMap;
use worker::*;

type Url = worker::Url;

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum DiffStatus {
    Added,
    Deleted,
    Modified,
}

#[derive(Debug, Clone, Serialize)]
pub struct FileDiff {
    pub path: String,
    pub status: DiffStatus,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub old_hash: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub new_hash: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hunks: Option<Vec<Hunk>>,
}

#[derive(Debug, Clone, Serialize)]
pub struct Hunk {
    pub old_start: usize,
    pub old_count: usize,
    pub new_start: usize,
    pub new_count: usize,
    pub lines: Vec<DiffLine>,
}

#[derive(Debug, Clone, Serialize)]
pub struct DiffLine {
    pub tag: &'static str, // "context", "add", "delete", "binary"
    pub content: String,
}

#[derive(Debug, Serialize)]
pub struct DiffStats {
    pub files_changed: usize,
    pub additions: usize,
    pub deletions: usize,
}

#[derive(Debug, Serialize)]
pub struct CommitDiff {
    pub commit_hash: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub parent_hash: Option<String>,
    pub files: Vec<FileDiff>,
    pub stats: DiffStats,
}

#[derive(Debug, Serialize)]
pub struct Comparison {
    pub base_hash: String,
    pub head_hash: String,
    pub files: Vec<FileDiff>,
    pub stats: DiffStats,
}

// ---------------------------------------------------------------------------
// Tree diff: recursive comparison of two trees
// ---------------------------------------------------------------------------

/// Compare two trees and return a flat list of file-level changes.
/// If `old_tree` is None, all files in `new_tree` are treated as added (root commit).
/// If `new_tree` is None, all files in `old_tree` are treated as deleted.
pub fn diff_trees(
    sql: &SqlStorage,
    old_tree: Option<&str>,
    new_tree: Option<&str>,
) -> Result<Vec<FileDiff>> {
    let mut diffs = Vec::new();
    diff_trees_recursive(sql, old_tree, new_tree, "", &mut diffs)?;
    diffs.sort_by(|a, b| a.path.cmp(&b.path));
    Ok(diffs)
}

fn diff_trees_recursive(
    sql: &SqlStorage,
    old_tree: Option<&str>,
    new_tree: Option<&str>,
    prefix: &str,
    diffs: &mut Vec<FileDiff>,
) -> Result<()> {
    let old_entries = match old_tree {
        Some(hash) => load_tree_entries(sql, hash)?,
        None => HashMap::new(),
    };
    let new_entries = match new_tree {
        Some(hash) => load_tree_entries(sql, hash)?,
        None => HashMap::new(),
    };

    // Collect all entry names, sorted for deterministic output
    let all_names: Vec<String> = {
        let mut set = std::collections::BTreeSet::new();
        for k in old_entries.keys() {
            set.insert(k.clone());
        }
        for k in new_entries.keys() {
            set.insert(k.clone());
        }
        set.into_iter().collect()
    };

    for name in &all_names {
        let full_path = if prefix.is_empty() {
            name.clone()
        } else {
            format!("{}/{}", prefix, name)
        };

        let old = old_entries.get(name);
        let new = new_entries.get(name);

        match (old, new) {
            // Same hash — nothing changed in this subtree/file
            (Some(o), Some(n)) if o.hash == n.hash => continue,

            // Both exist, different hashes
            (Some(o), Some(n)) => {
                let o_is_tree = o.mode == 0o040000;
                let n_is_tree = n.mode == 0o040000;

                match (o_is_tree, n_is_tree) {
                    (true, true) => {
                        // Both subtrees — recurse
                        diff_trees_recursive(sql, Some(&o.hash), Some(&n.hash), &full_path, diffs)?;
                    }
                    (false, false) => {
                        // Both blobs — file modified
                        diffs.push(FileDiff {
                            path: full_path,
                            status: DiffStatus::Modified,
                            old_hash: Some(o.hash.clone()),
                            new_hash: Some(n.hash.clone()),
                            hunks: None,
                        });
                    }
                    (true, false) => {
                        // Was subtree, now blob: delete old tree contents, add new blob
                        diff_trees_recursive(sql, Some(&o.hash), None, &full_path, diffs)?;
                        diffs.push(FileDiff {
                            path: full_path,
                            status: DiffStatus::Added,
                            old_hash: None,
                            new_hash: Some(n.hash.clone()),
                            hunks: None,
                        });
                    }
                    (false, true) => {
                        // Was blob, now subtree: delete old blob, add new tree contents
                        diffs.push(FileDiff {
                            path: full_path.clone(),
                            status: DiffStatus::Deleted,
                            old_hash: Some(o.hash.clone()),
                            new_hash: None,
                            hunks: None,
                        });
                        diff_trees_recursive(sql, None, Some(&n.hash), &full_path, diffs)?;
                    }
                }
            }

            // Only in old tree — deleted
            (Some(o), None) => {
                if o.mode == 0o040000 {
                    diff_trees_recursive(sql, Some(&o.hash), None, &full_path, diffs)?;
                } else {
                    diffs.push(FileDiff {
                        path: full_path,
                        status: DiffStatus::Deleted,
                        old_hash: Some(o.hash.clone()),
                        new_hash: None,
                        hunks: None,
                    });
                }
            }

            // Only in new tree — added
            (None, Some(n)) => {
                if n.mode == 0o040000 {
                    diff_trees_recursive(sql, None, Some(&n.hash), &full_path, diffs)?;
                } else {
                    diffs.push(FileDiff {
                        path: full_path,
                        status: DiffStatus::Added,
                        old_hash: None,
                        new_hash: Some(n.hash.clone()),
                        hunks: None,
                    });
                }
            }

            (None, None) => unreachable!(),
        }
    }

    Ok(())
}

struct TreeEntryInfo {
    mode: u32,
    hash: String,
}

fn load_tree_entries(sql: &SqlStorage, tree_hash: &str) -> Result<HashMap<String, TreeEntryInfo>> {
    #[derive(serde::Deserialize)]
    struct Row {
        name: String,
        mode: i64,
        entry_hash: String,
    }

    let rows: Vec<Row> = sql
        .exec(
            "SELECT name, mode, entry_hash FROM trees WHERE tree_hash = ?",
            vec![SqlStorageValue::from(tree_hash.to_string())],
        )?
        .to_array()?;

    let mut map = HashMap::with_capacity(rows.len());
    for row in rows {
        map.insert(
            row.name,
            TreeEntryInfo {
                mode: row.mode as u32,
                hash: row.entry_hash,
            },
        );
    }
    Ok(map)
}

// ---------------------------------------------------------------------------
// Line-level diff using `similar`
// ---------------------------------------------------------------------------

/// Compute a line-level diff between two blobs. Returns hunks with context.
pub fn diff_blobs(old_content: &[u8], new_content: &[u8], context_lines: usize) -> Vec<Hunk> {
    if is_binary(old_content) || is_binary(new_content) {
        return vec![Hunk {
            old_start: 0,
            old_count: 0,
            new_start: 0,
            new_count: 0,
            lines: vec![DiffLine {
                tag: "binary",
                content: "Binary files differ".to_string(),
            }],
        }];
    }

    let old_text = String::from_utf8_lossy(old_content);
    let new_text = String::from_utf8_lossy(new_content);

    let diff = similar::TextDiff::from_lines(old_text.as_ref(), new_text.as_ref());
    let mut hunks = Vec::new();

    for group in diff.grouped_ops(context_lines) {
        if group.is_empty() {
            continue;
        }

        let old_range_start = group.first().unwrap().old_range().start;
        let old_range_end = group.last().unwrap().old_range().end;
        let new_range_start = group.first().unwrap().new_range().start;
        let new_range_end = group.last().unwrap().new_range().end;

        let mut lines = Vec::new();
        for op in &group {
            for change in diff.iter_changes(op) {
                let tag = match change.tag() {
                    similar::ChangeTag::Equal => "context",
                    similar::ChangeTag::Insert => "add",
                    similar::ChangeTag::Delete => "delete",
                };
                lines.push(DiffLine {
                    tag,
                    content: change.value().to_string(),
                });
            }
        }

        hunks.push(Hunk {
            old_start: if old_range_start == old_range_end {
                0
            } else {
                old_range_start + 1
            },
            old_count: old_range_end - old_range_start,
            new_start: if new_range_start == new_range_end {
                0
            } else {
                new_range_start + 1
            },
            new_count: new_range_end - new_range_start,
            lines,
        });
    }

    hunks
}

/// Check for null bytes in the first 8 KiB — heuristic for binary content.
fn is_binary(data: &[u8]) -> bool {
    let check_len = data.len().min(8192);
    data[..check_len].contains(&0)
}

// ---------------------------------------------------------------------------
// High-level diff operations
// ---------------------------------------------------------------------------

/// Diff a commit against its first parent. Root commits diff against empty tree.
/// When `with_content` is true, line-level hunks are computed for each file.
pub fn diff_commit(
    sql: &SqlStorage,
    commit_hash: &str,
    with_content: bool,
    context_lines: usize,
) -> Result<CommitDiff> {
    let (tree_hash, parents) = load_commit_info(sql, commit_hash)?;

    let parent_tree = if let Some(parent_hash) = parents.first() {
        let (pt, _) = load_commit_info(sql, parent_hash)?;
        Some(pt)
    } else {
        None // root commit — everything is new
    };

    let mut files = diff_trees(sql, parent_tree.as_deref(), Some(&tree_hash))?;

    if with_content {
        populate_hunks(sql, &mut files, context_lines)?;
    }

    let stats = compute_stats(&files);

    Ok(CommitDiff {
        commit_hash: commit_hash.to_string(),
        parent_hash: parents.into_iter().next(),
        files,
        stats,
    })
}

/// Diff two commits by comparing their trees.
pub fn compare(
    sql: &SqlStorage,
    base_hash: &str,
    head_hash: &str,
    with_content: bool,
    context_lines: usize,
) -> Result<Comparison> {
    let (base_tree, _) = load_commit_info(sql, base_hash)?;
    let (head_tree, _) = load_commit_info(sql, head_hash)?;

    let mut files = diff_trees(sql, Some(&base_tree), Some(&head_tree))?;

    if with_content {
        populate_hunks(sql, &mut files, context_lines)?;
    }

    let stats = compute_stats(&files);

    Ok(Comparison {
        base_hash: base_hash.to_string(),
        head_hash: head_hash.to_string(),
        files,
        stats,
    })
}

/// Fill in line-level hunks for every file diff.
fn populate_hunks(sql: &SqlStorage, files: &mut [FileDiff], context_lines: usize) -> Result<()> {
    for file in files.iter_mut() {
        let old = match &file.old_hash {
            Some(h) => load_blob_content(sql, h)?.unwrap_or_default(),
            None => Vec::new(),
        };
        let new = match &file.new_hash {
            Some(h) => load_blob_content(sql, h)?.unwrap_or_default(),
            None => Vec::new(),
        };
        file.hunks = Some(diff_blobs(&old, &new, context_lines));
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

fn load_blob_content(sql: &SqlStorage, blob_hash: &str) -> Result<Option<Vec<u8>>> {
    #[derive(serde::Deserialize)]
    struct BlobInfo {
        group_id: i64,
        version_in_group: i64,
    }

    let rows: Vec<BlobInfo> = sql
        .exec(
            "SELECT group_id, version_in_group FROM blobs WHERE blob_hash = ?",
            vec![SqlStorageValue::from(blob_hash.to_string())],
        )?
        .to_array()?;

    match rows.into_iter().next() {
        Some(info) => {
            let content = store::reconstruct_blob(sql, info.group_id, info.version_in_group)?;
            Ok(Some(content))
        }
        None => Ok(None),
    }
}

fn load_commit_info(sql: &SqlStorage, hash: &str) -> Result<(String, Vec<String>)> {
    #[derive(serde::Deserialize)]
    struct CommitRow {
        tree_hash: String,
    }

    let commits: Vec<CommitRow> = sql
        .exec(
            "SELECT tree_hash FROM commits WHERE hash = ?",
            vec![SqlStorageValue::from(hash.to_string())],
        )?
        .to_array()?;

    let tree_hash = commits
        .into_iter()
        .next()
        .ok_or_else(|| Error::RustError(format!("commit not found: {}", hash)))?
        .tree_hash;

    #[derive(serde::Deserialize)]
    struct ParentRow {
        parent_hash: String,
    }

    let parents: Vec<String> = sql
        .exec(
            "SELECT parent_hash FROM commit_parents
             WHERE commit_hash = ? ORDER BY ordinal ASC",
            vec![SqlStorageValue::from(hash.to_string())],
        )?
        .to_array::<ParentRow>()?
        .into_iter()
        .map(|p| p.parent_hash)
        .collect();

    Ok((tree_hash, parents))
}

fn compute_stats(files: &[FileDiff]) -> DiffStats {
    let mut additions = 0usize;
    let mut deletions = 0usize;

    for file in files {
        if let Some(hunks) = &file.hunks {
            for hunk in hunks {
                for line in &hunk.lines {
                    match line.tag {
                        "add" => additions += 1,
                        "delete" => deletions += 1,
                        _ => {}
                    }
                }
            }
        } else {
            // No content diff — count at file level
            match file.status {
                DiffStatus::Added => additions += 1,
                DiffStatus::Deleted => deletions += 1,
                DiffStatus::Modified => {
                    additions += 1;
                    deletions += 1;
                }
            }
        }
    }

    DiffStats {
        files_changed: files.len(),
        additions,
        deletions,
    }
}

/// Try to resolve a string as a commit hash or ref name.
fn resolve_to_commit(sql: &SqlStorage, name: &str) -> Result<String> {
    // Check if it's already a known commit hash
    #[derive(serde::Deserialize)]
    struct Count {
        n: i64,
    }

    let existing: Count = sql
        .exec(
            "SELECT COUNT(*) AS n FROM commits WHERE hash = ?",
            vec![SqlStorageValue::from(name.to_string())],
        )?
        .one()?;

    if existing.n > 0 {
        return Ok(name.to_string());
    }

    // Try as a ref name
    match crate::api::resolve_ref(sql, name)? {
        Some(hash) => Ok(hash),
        None => Err(Error::RustError(format!(
            "'{}' is not a known commit or ref",
            name
        ))),
    }
}

// ---------------------------------------------------------------------------
// HTTP handlers
// ---------------------------------------------------------------------------

/// GET /diff/:sha?context=3&stat=1
pub fn handle_diff(sql: &SqlStorage, sha: &str, url: &Url) -> Result<Response> {
    if sha.is_empty() {
        return Response::error("missing commit hash", 400);
    }

    let context: usize = crate::api::get_query(url, "context")
        .and_then(|v| v.parse().ok())
        .unwrap_or(3);
    let stat_only = crate::api::get_query(url, "stat")
        .map(|v| v == "1")
        .unwrap_or(false);

    let result = diff_commit(sql, sha, !stat_only, context)?;
    Response::from_json(&result)
}

/// GET /compare/base...head?context=3&stat=1
pub fn handle_compare(sql: &SqlStorage, spec: &str, url: &Url) -> Result<Response> {
    // Accept both "base...head" (three-dot) and "base..head" (two-dot)
    let (base, head) = if let Some(pos) = spec.find("...") {
        (&spec[..pos], &spec[pos + 3..])
    } else if let Some(pos) = spec.find("..") {
        (&spec[..pos], &spec[pos + 2..])
    } else {
        return Response::error("use /compare/base...head or /compare/base..head", 400);
    };

    if base.is_empty() || head.is_empty() {
        return Response::error("both base and head are required", 400);
    }

    let base_hash = resolve_to_commit(sql, base)?;
    let head_hash = resolve_to_commit(sql, head)?;

    let context: usize = crate::api::get_query(url, "context")
        .and_then(|v| v.parse().ok())
        .unwrap_or(3);
    let stat_only = crate::api::get_query(url, "stat")
        .map(|v| v == "1")
        .unwrap_or(false);

    let result = compare(sql, &base_hash, &head_hash, !stat_only, context)?;
    Response::from_json(&result)
}