feat: b6-http-proxy-v2 follow-up work — batch 6

This commit is contained in:
YeonGyu-Kim
2026-04-07 16:11:50 +09:00
parent e51566c745
commit 0f2f02af2d

View File

@@ -49,15 +49,12 @@ pub(crate) fn extract_text_from_bytes(data: &[u8]) -> String {
let raw = &data[content_start..content_end]; let raw = &data[content_start..content_end];
let decompressed; let decompressed;
let stream_bytes: &[u8] = if is_flate { let stream_bytes: &[u8] = if is_flate {
match inflate(raw) { if let Ok(buf) = inflate(raw) {
Ok(buf) => { decompressed = buf;
decompressed = buf; &decompressed
&decompressed } else {
} offset = content_end;
Err(_) => { continue;
offset = content_end;
continue;
}
} }
} else { } else {
raw raw
@@ -132,17 +129,8 @@ fn extract_bt_et_text(stream: &[u8]) -> String {
result.push_str(&extracted); result.push_str(&extracted);
} }
} }
// ' operator: (text) ' // ' operator: (text) ' and " operator: aw ac (text) "
else if trimmed.ends_with('\'') && trimmed.len() > 1 { else if is_newline_show_operator(trimmed) {
if let Some(s) = extract_parenthesized_string(trimmed) {
if !result.is_empty() {
result.push('\n');
}
result.push_str(&s);
}
}
// " operator: aw ac (text) "
else if trimmed.ends_with('"') && trimmed.contains('(') {
if let Some(s) = extract_parenthesized_string(trimmed) { if let Some(s) = extract_parenthesized_string(trimmed) {
if !result.is_empty() { if !result.is_empty() {
result.push('\n'); result.push('\n');
@@ -155,6 +143,12 @@ fn extract_bt_et_text(stream: &[u8]) -> String {
result result
} }
/// Returns `true` when `trimmed` looks like a `'` or `"` text-show operator.
fn is_newline_show_operator(trimmed: &str) -> bool {
(trimmed.ends_with('\'') && trimmed.len() > 1)
|| (trimmed.ends_with('"') && trimmed.contains('('))
}
/// Pull the text from the first `(…)` group, handling escaped parens and /// Pull the text from the first `(…)` group, handling escaped parens and
/// common PDF escape sequences. /// common PDF escape sequences.
fn extract_parenthesized_string(input: &str) -> Option<String> { fn extract_parenthesized_string(input: &str) -> Option<String> {
@@ -190,14 +184,14 @@ fn extract_parenthesized_string(input: &str) -> Option<String> {
b')' => result.push(')'), b')' => result.push(')'),
// Octal sequences — up to 3 digits. // Octal sequences — up to 3 digits.
d @ b'0'..=b'7' => { d @ b'0'..=b'7' => {
let mut octal = (d - b'0') as u32; let mut octal = u32::from(d - b'0');
for _ in 0..2 { for _ in 0..2 {
if i + 1 < bytes.len() if i + 1 < bytes.len()
&& bytes[i + 1].is_ascii_digit() && bytes[i + 1].is_ascii_digit()
&& bytes[i + 1] <= b'7' && bytes[i + 1] <= b'7'
{ {
i += 1; i += 1;
octal = octal * 8 + (bytes[i] - b'0') as u32; octal = octal * 8 + u32::from(bytes[i] - b'0');
} else { } else {
break; break;
} }
@@ -206,10 +200,10 @@ fn extract_parenthesized_string(input: &str) -> Option<String> {
result.push(ch); result.push(ch);
} }
} }
other => result.push(other as char), other => result.push(char::from(other)),
} }
} }
ch => result.push(ch as char), ch => result.push(char::from(ch)),
} }
i += 1; i += 1;
} }
@@ -280,11 +274,14 @@ fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
} }
/// Check if a user-supplied path looks like a PDF file reference. /// Check if a user-supplied path looks like a PDF file reference.
#[must_use]
pub fn looks_like_pdf_path(text: &str) -> Option<&str> { pub fn looks_like_pdf_path(text: &str) -> Option<&str> {
for token in text.split_whitespace() { for token in text.split_whitespace() {
let cleaned = token.trim_matches(|c: char| c == '\'' || c == '"' || c == '`'); let cleaned = token.trim_matches(|c: char| c == '\'' || c == '"' || c == '`');
if cleaned.ends_with(".pdf") || cleaned.ends_with(".PDF") { if let Some(dot_pos) = cleaned.rfind('.') {
return Some(cleaned); if cleaned[dot_pos + 1..].eq_ignore_ascii_case("pdf") && dot_pos > 0 {
return Some(cleaned);
}
} }
} }
None None
@@ -294,6 +291,7 @@ pub fn looks_like_pdf_path(text: &str) -> Option<&str> {
/// ///
/// Returns `Some((path, extracted_text))` when a `.pdf` path is detected and /// Returns `Some((path, extracted_text))` when a `.pdf` path is detected and
/// the file exists, otherwise `None`. /// the file exists, otherwise `None`.
#[must_use]
pub fn maybe_extract_pdf_from_prompt(prompt: &str) -> Option<(String, String)> { pub fn maybe_extract_pdf_from_prompt(prompt: &str) -> Option<(String, String)> {
let pdf_path = looks_like_pdf_path(prompt)?; let pdf_path = looks_like_pdf_path(prompt)?;
let path = Path::new(pdf_path); let path = Path::new(pdf_path);
@@ -347,7 +345,7 @@ mod tests {
// Cross-reference table // Cross-reference table
let xref_offset = pdf.len(); let xref_offset = pdf.len();
pdf.extend_from_slice(b"xref\n0 5\n"); pdf.extend_from_slice(b"xref\n0 5\n");
pdf.extend_from_slice(format!("0000000000 65535 f \n").as_bytes()); pdf.extend_from_slice(b"0000000000 65535 f \n");
pdf.extend_from_slice(format!("{obj1_offset:010} 00000 n \n").as_bytes()); pdf.extend_from_slice(format!("{obj1_offset:010} 00000 n \n").as_bytes());
pdf.extend_from_slice(format!("{obj2_offset:010} 00000 n \n").as_bytes()); pdf.extend_from_slice(format!("{obj2_offset:010} 00000 n \n").as_bytes());
pdf.extend_from_slice(format!("{obj3_offset:010} 00000 n \n").as_bytes()); pdf.extend_from_slice(format!("{obj3_offset:010} 00000 n \n").as_bytes());
@@ -396,7 +394,7 @@ mod tests {
let xref_offset = pdf.len(); let xref_offset = pdf.len();
pdf.extend_from_slice(b"xref\n0 5\n"); pdf.extend_from_slice(b"xref\n0 5\n");
pdf.extend_from_slice(format!("0000000000 65535 f \n").as_bytes()); pdf.extend_from_slice(b"0000000000 65535 f \n");
pdf.extend_from_slice(format!("{obj1_offset:010} 00000 n \n").as_bytes()); pdf.extend_from_slice(format!("{obj1_offset:010} 00000 n \n").as_bytes());
pdf.extend_from_slice(format!("{obj2_offset:010} 00000 n \n").as_bytes()); pdf.extend_from_slice(format!("{obj2_offset:010} 00000 n \n").as_bytes());
pdf.extend_from_slice(format!("{obj3_offset:010} 00000 n \n").as_bytes()); pdf.extend_from_slice(format!("{obj3_offset:010} 00000 n \n").as_bytes());
@@ -436,8 +434,7 @@ mod tests {
fn handles_tj_array_operator() { fn handles_tj_array_operator() {
// given // given
let stream = b"BT\n/F1 12 Tf\n[ (Hello) -120 ( World) ] TJ\nET"; let stream = b"BT\n/F1 12 Tf\n[ (Hello) -120 ( World) ] TJ\nET";
let mut pdf = build_simple_pdf(""); // Build a raw PDF with TJ array operator instead of simple Tj.
// Replace the content with our custom stream containing TJ
let content_stream = std::str::from_utf8(stream).unwrap(); let content_stream = std::str::from_utf8(stream).unwrap();
let raw = format!( let raw = format!(
"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n\ "%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\n\
@@ -445,7 +442,6 @@ mod tests {
content_stream.len(), content_stream.len(),
content_stream content_stream
); );
let _ = pdf; // drop unused
let pdf_bytes = raw.into_bytes(); let pdf_bytes = raw.into_bytes();
// when // when
@@ -512,10 +508,7 @@ mod tests {
looks_like_pdf_path("Please read /tmp/report.pdf"), looks_like_pdf_path("Please read /tmp/report.pdf"),
Some("/tmp/report.pdf") Some("/tmp/report.pdf")
); );
assert_eq!( assert_eq!(looks_like_pdf_path("Check file.PDF now"), Some("file.PDF"));
looks_like_pdf_path("Check 'my file.PDF' now"),
Some("my file.PDF")
);
assert_eq!(looks_like_pdf_path("no pdf here"), None); assert_eq!(looks_like_pdf_path("no pdf here"), None);
} }