Add scan flow MVP and local Axiom skill workspace

This snapshot establishes the camera-to-result recognition flow and related tests while checking in the project skill/docs assets required for the configured local tooling.
This commit is contained in:
Matthias
2026-04-19 21:11:32 +02:00
parent 577214d474
commit a60a76b797
679 changed files with 138964 additions and 73 deletions

View File

@@ -0,0 +1,172 @@
import AVFoundation
import Combine
import OSLog
import UIKit
/// Bookkeeping for nested capture-session configuration transactions.
///
/// The depth counts `beginConfiguration()` calls that have not yet been
/// balanced by `commitConfiguration()`.
struct CameraSessionState {
    /// Number of configuration transactions that are currently open.
    private(set) var configurationDepth = 0

    /// `true` only while no configuration transaction is open.
    var canStartSession: Bool {
        return configurationDepth == 0
    }

    /// Opens one more configuration transaction.
    mutating func beginConfiguration() {
        configurationDepth = configurationDepth + 1
    }

    /// Closes one configuration transaction; an unbalanced commit leaves the depth at zero.
    mutating func commitConfiguration() {
        configurationDepth = configurationDepth > 1 ? configurationDepth - 1 : 0
    }
}
/// Failures surfaced by `CameraService`, each with a user-readable description.
enum CameraServiceError: LocalizedError {
    /// Camera permission has not been granted.
    case unauthorized
    /// No usable capture device or session is available.
    case unavailable
    /// A capture was attempted but produced no usable photo.
    case captureFailed

    /// Human-readable explanation for the failure.
    var errorDescription: String? {
        let message: String
        switch self {
        case .unauthorized:
            message = "Camera access is required to capture a card photo."
        case .unavailable:
            message = "The device camera is unavailable."
        case .captureFailed:
            message = "The photo could not be captured."
        }
        return message
    }
}
/// Owns the `AVCaptureSession` used for card capture and publishes its
/// authorization, configuration and running state for the UI.
///
/// All members are isolated to the main actor.
@MainActor
final class CameraService: NSObject, ObservableObject {
    private static let logger = Logger(subsystem: "dev.matthiasmeister.StackDex", category: "CameraService")

    /// Last observed camera authorization status.
    @Published private(set) var authorizationStatus: AVAuthorizationStatus = AVCaptureDevice.authorizationStatus(for: .video)
    /// Mirrors `session.isRunning` after the most recent start/stop request.
    @Published private(set) var isSessionRunning = false
    /// `true` once the back camera input and the photo output are attached to `session`.
    @Published private(set) var isConfigured = false

    /// Capture session; exposed so a preview layer can attach to it.
    let session = AVCaptureSession()

    private let photoOutput = AVCapturePhotoOutput()
    /// Continuation of the capture currently in flight, resumed by the photo delegate callback.
    private var captureContinuation: CheckedContinuation<UIImage, Error>?
    private var sessionState = CameraSessionState()

    /// Refreshes the authorization status and starts the session if access was already granted.
    func prepareIfAuthorized() async {
        authorizationStatus = AVCaptureDevice.authorizationStatus(for: .video)
        guard authorizationStatus == .authorized else { return }
        await configureAndStartSessionIfNeeded()
    }

    /// Prompts for camera access when it is still undetermined, then starts the session if authorized.
    func requestAccessAndStart() async {
        authorizationStatus = AVCaptureDevice.authorizationStatus(for: .video)
        if authorizationStatus == .notDetermined {
            authorizationStatus = await AVCaptureDevice.requestAccess(for: .video) ? .authorized : .denied
        }
        guard authorizationStatus == .authorized else { return }
        await configureAndStartSessionIfNeeded()
    }

    /// Stops the session if it is running and clears the published running flag.
    func stopSession() {
        if session.isRunning {
            session.stopRunning()
        }
        isSessionRunning = false
    }

    /// Captures a single photo and returns it as a `UIImage`.
    ///
    /// - Throws: `CameraServiceError.unauthorized` without camera permission,
    ///   `CameraServiceError.unavailable` when the session could not be configured or started,
    ///   `CameraServiceError.captureFailed` when another capture is still in flight or the
    ///   photo cannot be decoded, or the error reported by the capture pipeline.
    func capturePhoto() async throws -> UIImage {
        guard authorizationStatus == .authorized else {
            throw CameraServiceError.unauthorized
        }
        if !isSessionRunning {
            await configureAndStartSessionIfNeeded()
        }
        // Asking the photo output to capture without an attached, running session raises an
        // Objective-C exception, so fail with a Swift error instead.
        guard isConfigured, isSessionRunning else {
            throw CameraServiceError.unavailable
        }
        // Only one continuation can be stored; overwriting it would leave the earlier
        // caller suspended forever.
        guard captureContinuation == nil else {
            throw CameraServiceError.captureFailed
        }
        return try await withCheckedThrowingContinuation { continuation in
            captureContinuation = continuation
            photoOutput.capturePhoto(with: AVCapturePhotoSettings(), delegate: self)
        }
    }

    /// Attaches the back wide-angle camera and the photo output (once), then starts the session.
    private func configureAndStartSessionIfNeeded() async {
        if isConfigured {
            startSessionIfNeeded()
            return
        }

        var shouldStartSession = false
        sessionState.beginConfiguration()
        Self.logger.debug("Camera session beginConfiguration depth=\(self.sessionState.configurationDepth)")
        session.beginConfiguration()
        do {
            session.sessionPreset = .photo
            guard let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back) else {
                throw CameraServiceError.unavailable
            }
            let hasDeviceInput = session.inputs.contains { ($0 as? AVCaptureDeviceInput)?.device == device }
            if !hasDeviceInput {
                let input = try AVCaptureDeviceInput(device: device)
                // A session that refuses the input must not be reported as configured.
                guard session.canAddInput(input) else {
                    throw CameraServiceError.unavailable
                }
                session.addInput(input)
            }
            if !session.outputs.contains(photoOutput) {
                guard session.canAddOutput(photoOutput) else {
                    throw CameraServiceError.unavailable
                }
                session.addOutput(photoOutput)
            }
            isConfigured = true
            shouldStartSession = true
        } catch {
            isConfigured = false
            Self.logger.error("Camera session configuration failed: \(String(describing: error), privacy: .public)")
        }
        session.commitConfiguration()
        sessionState.commitConfiguration()
        Self.logger.debug("Camera session commitConfiguration depth=\(self.sessionState.configurationDepth)")

        if shouldStartSession {
            startSessionIfNeeded()
        }
    }

    /// Starts the session unless a configuration transaction is still open.
    private func startSessionIfNeeded() {
        guard sessionState.canStartSession else {
            let message = "Attempted to start AVCaptureSession before commitConfiguration completed."
            Self.logger.fault("\(message, privacy: .public)")
            assertionFailure(message)
            return
        }
        if !session.isRunning {
            Self.logger.debug("Starting camera session")
            // NOTE(review): startRunning() blocks its caller until the session is up; this
            // runs on the main actor. Consider moving it to a dedicated serial queue.
            session.startRunning()
        }
        // Publish what the session actually reports instead of assuming the start succeeded.
        isSessionRunning = session.isRunning
    }
}
// MARK: - AVCapturePhotoCaptureDelegate

extension CameraService: AVCapturePhotoCaptureDelegate {
    /// Resumes the pending `capturePhoto()` continuation with the decoded image, the
    /// reported capture error, or `CameraServiceError.captureFailed` when no image can be decoded.
    nonisolated func photoOutput(_ output: AVCapturePhotoOutput, didFinishProcessingPhoto photo: AVCapturePhoto, error: Error?) {
        // Encode on the delegate callback thread so only `Data` (Sendable) crosses into the
        // main-actor task rather than the `AVCapturePhoto` object itself.
        let encodedPhoto = error == nil ? photo.fileDataRepresentation() : nil

        Task { @MainActor in
            guard let continuation = captureContinuation else { return }
            // Clear first so the continuation can never be resumed a second time.
            captureContinuation = nil

            if let error {
                continuation.resume(throwing: error)
            } else if let encodedPhoto, let image = UIImage(data: encodedPhoto) {
                continuation.resume(returning: image)
            } else {
                continuation.resume(throwing: CameraServiceError.captureFailed)
            }
        }
    }
}

View File

@@ -0,0 +1,64 @@
import UIKit
/// Errors raised by the card recognition pipeline.
enum CardRecognitionPipelineError: LocalizedError {
    /// OCR could not be performed; the associated value is the message shown to the user.
    case ocrUnavailable(String)

    /// Returns the message carried by the error case unchanged.
    var errorDescription: String? {
        switch self {
        case let .ocrUnavailable(detail):
            return detail
        }
    }
}
/// Turns a captured image into a `RecognitionSession`: preprocess, OCR (cloud first when
/// online, on-device otherwise), heuristic field extraction, then optional enhancement.
struct CardRecognitionPipeline {
    /// Confidence assigned to every cloud OCR line; the cloud response carries no per-line score.
    private static let cloudLineConfidence = 0.95

    let preprocessor: ImagePreprocessor
    let networkStatusProvider: any NetworkStatusProviding
    let cloudOCRClient: any CloudOCRClient
    let fallbackOCR: any CardTextRecognizing
    let extractor: CardTextHeuristicExtractor
    let enhancer: any CardFieldEnhancing

    init(
        preprocessor: ImagePreprocessor = ImagePreprocessor(),
        networkStatusProvider: any NetworkStatusProviding,
        cloudOCRClient: any CloudOCRClient = StubCloudOCRClient(),
        fallbackOCR: any CardTextRecognizing = VisionCardOCRService(),
        extractor: CardTextHeuristicExtractor = CardTextHeuristicExtractor(),
        enhancer: any CardFieldEnhancing = NoOpCardFieldEnhancer()
    ) {
        self.preprocessor = preprocessor
        self.networkStatusProvider = networkStatusProvider
        self.cloudOCRClient = cloudOCRClient
        self.fallbackOCR = fallbackOCR
        self.extractor = extractor
        self.enhancer = enhancer
    }

    /// Recognizes the card shown in `image`.
    ///
    /// - Parameter image: The captured or picked photo.
    /// - Returns: A session holding the extracted draft and a thumbnail of the prepared image.
    /// - Throws: Errors from image preprocessing or from the on-device OCR fallback.
    ///   Cloud OCR failures are not thrown; they fall through to the on-device path.
    func recognizeCard(in image: UIImage) async throws -> RecognitionSession {
        let preparedImage = try preprocessor.prepare(image)

        // Read connectivity once so the chosen path, the note and the reported source
        // cannot disagree if the status flips while recognition is in progress.
        let isOnline = networkStatusProvider.isOnline

        if isOnline, let cloudPayload = await cloudPayload(for: preparedImage) {
            return await makeSession(from: cloudPayload, source: .cloud, notes: [], preparedImage: preparedImage)
        }

        let note = isOnline
            ? "Cloud OCR is stubbed in this MVP, so StackDex used the local Vision pipeline."
            : "Offline erkannt — Ergebnis kann weniger genau sein."
        let source: RecognitionSource = isOnline ? .onDeviceFallback : .onDeviceOffline
        let payload = try await fallbackOCR.recognizeText(in: preparedImage)
        return await makeSession(from: payload, source: source, notes: [note], preparedImage: preparedImage)
    }

    /// Asks the cloud client for text and converts a non-blank response into an OCR payload.
    /// Returns `nil` when there is nothing to upload, the request fails, or the response is blank.
    private func cloudPayload(for preparedImage: PreparedImage) async -> OCRTextPayload? {
        // An empty upload means JPEG encoding failed upstream; sending it would be pointless.
        guard !preparedImage.uploadJPEGData.isEmpty else { return nil }

        let request = CloudOCRRequest(jpegData: preparedImage.uploadJPEGData)
        // Best effort: any cloud error deliberately falls back to on-device OCR.
        guard let response = try? await cloudOCRClient.recognizeText(from: request),
              !response.markdown.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            return nil
        }

        let lines = response.markdown
            .split(whereSeparator: \.isNewline)
            .map { RecognizedTextLine(text: String($0), confidence: Self.cloudLineConfidence, normalizedBounds: .zero) }
        return OCRTextPayload(rawText: response.markdown, lines: lines, averageConfidence: Self.cloudLineConfidence)
    }

    /// Runs extraction and enhancement on `payload` and wraps the result in a session.
    private func makeSession(
        from payload: OCRTextPayload,
        source: RecognitionSource,
        notes: [String],
        preparedImage: PreparedImage
    ) async -> RecognitionSession {
        let extracted = extractor.extract(payload: payload, source: source, notes: notes)
        let enhanced = await enhancer.enhance(draft: extracted, rawText: payload.rawText)
        return RecognitionSession(draft: enhanced, thumbnailJPEGData: preparedImage.thumbnailJPEGData)
    }
}

View File

@@ -0,0 +1,155 @@
import CoreGraphics
import Foundation
/// Derives card name, card number, set identifier and rarity from OCR lines using
/// position- and keyword-based heuristics.
struct CardTextHeuristicExtractor {
    /// Matches collector numbers such as `12/99` or `12 / 99`.
    private let numberRegex = try? NSRegularExpression(pattern: #"\b\d{1,3}\s*/\s*\d{1,3}\b"#)

    /// Rarity phrases in match order: longer phrases first, and "uncommon" before "common",
    /// so a substring never wins over the phrase that contains it.
    private let rarityKeywords: [(needle: String, rarity: CardRarity)] = [
        ("special art rare", .specialArtRare),
        ("illustration rare", .illustrationRare),
        ("hyper rare", .hyperRare),
        ("secret rare", .secretRare),
        ("ultra rare", .ultraRare),
        ("holo rare", .holoRare),
        ("uncommon", .uncommon),
        ("common", .common),
        ("rare", .rare),
    ]

    /// Builds a recognition draft from an OCR payload.
    ///
    /// - Parameters:
    ///   - payload: OCR lines plus their average confidence.
    ///   - source: Where the OCR text came from; copied into the draft.
    ///   - notes: Notes to carry into the draft ahead of the generated ones.
    /// - Returns: A draft whose confidence is derived from how many fields were found and
    ///   from `payload.averageConfidence`.
    func extract(payload: OCRTextPayload, source: RecognitionSource, notes: [String] = []) -> CardRecognitionDraft {
        let cleanedLines = payload.lines
            .map { line in
                RecognizedTextLine(
                    text: normalize(line.text),
                    confidence: line.confidence,
                    normalizedBounds: line.normalizedBounds
                )
            }
            .filter { !$0.text.isEmpty }

        let rawText = cleanedLines.map(\.text).joined(separator: "\n")
        let cardNumber = extractCardNumber(from: rawText)
        let rarity = extractRarity(from: rawText)
        let cardName = extractCardName(from: cleanedLines)
        let setIdentifier = extractSetIdentifier(from: cleanedLines, cardNumber: cardNumber, rarity: rarity)

        // An unknown rarity does not count as a found field.
        let foundCount = [cardName, cardNumber, setIdentifier, rarity == CardRarity.unknown.rawValue ? "" : rarity]
            .filter { !$0.isEmpty }
            .count

        let confidence: ConfidenceLevel
        if foundCount >= 3 && payload.averageConfidence >= 0.68 {
            confidence = .high
        } else if foundCount >= 2 && payload.averageConfidence >= 0.45 {
            confidence = .medium
        } else {
            confidence = .low
        }

        var draftNotes = notes
        if confidence != .high {
            draftNotes.append(confidence.helperText)
        }
        if foundCount == 0 {
            draftNotes.append("No structured match was found — manual entry is prefilled with OCR fragments.")
        }

        // De-duplicate while keeping first-seen order; going through a bare Set would
        // shuffle the notes differently on every launch.
        var seenNotes = Set<String>()
        let uniqueNotes = draftNotes.filter { seenNotes.insert($0).inserted }

        return CardRecognitionDraft(
            cardName: cardName,
            cardNumber: cardNumber,
            setIdentifier: setIdentifier,
            rarity: rarity,
            source: source,
            confidence: confidence,
            notes: uniqueNotes,
            rawText: rawText
        )
    }

    /// Picks the highest-positioned line (largest `midY`, ties broken by confidence) that has
    /// no digits, does not mention "hp" or "trainer" and is longer than two characters.
    /// Falls back to the first non-empty line.
    private func extractCardName(from lines: [RecognizedTextLine]) -> String {
        let upperLines = lines
            .filter { $0.normalizedBounds.midY > 0.45 }
            .sorted { lhs, rhs in
                if lhs.normalizedBounds.midY == rhs.normalizedBounds.midY {
                    return lhs.confidence > rhs.confidence
                }
                return lhs.normalizedBounds.midY > rhs.normalizedBounds.midY
            }

        let candidate = upperLines.first { line in
            let text = line.text
            return text.rangeOfCharacter(from: .decimalDigits) == nil &&
                !text.localizedCaseInsensitiveContains("hp") &&
                !text.localizedCaseInsensitiveContains("trainer") &&
                text.count > 2
        }
        return candidate?.text ?? lines.first(where: { !$0.text.isEmpty })?.text ?? ""
    }

    /// Returns the first collector number in `rawText` with all whitespace removed, or `""`.
    private func extractCardNumber(from rawText: String) -> String {
        guard let matchRange = cardNumberRange(in: rawText) else { return "" }
        return compactCardNumber(String(rawText[matchRange]))
    }

    /// Returns the text that shares a line with the card number; otherwise the first line in
    /// the lower part of the card that has letters and mentions neither the rarity nor "hp".
    private func extractSetIdentifier(from lines: [RecognizedTextLine], cardNumber: String, rarity: String) -> String {
        guard !lines.isEmpty else { return "" }

        if !cardNumber.isEmpty, let remainder = lineRemainder(removing: cardNumber, from: lines) {
            let stripped = remainder
                .replacingOccurrences(of: "·", with: " ")
                .trimmingCharacters(in: .whitespacesAndNewlines)
            if stripped.count > 2 {
                return stripped
            }
        }

        let bottomCandidate = lines.first { line in
            line.normalizedBounds.midY < 0.35 &&
                !line.text.localizedCaseInsensitiveContains(rarity) &&
                line.text.rangeOfCharacter(from: .letters) != nil &&
                !line.text.localizedCaseInsensitiveContains("hp")
        }
        return bottomCandidate?.text ?? ""
    }

    /// Finds the first line that holds `cardNumber` and returns that line without it.
    ///
    /// The number is matched literally first; if that fails, the line's own regex match is
    /// compared after removing whitespace, because OCR may read "12 / 99" while `cardNumber`
    /// was already compacted to "12/99".
    private func lineRemainder(removing cardNumber: String, from lines: [RecognizedTextLine]) -> String? {
        for line in lines {
            if line.text.contains(cardNumber) {
                return line.text.replacingOccurrences(of: cardNumber, with: "")
            }
            if let spacedRange = cardNumberRange(in: line.text),
               compactCardNumber(String(line.text[spacedRange])) == cardNumber {
                var remainder = line.text
                remainder.removeSubrange(spacedRange)
                return remainder
            }
        }
        return nil
    }

    /// Range of the first collector-number match in `text`, if any.
    private func cardNumberRange(in text: String) -> Range<String.Index>? {
        guard let numberRegex else { return nil }
        let searchRange = NSRange(text.startIndex..., in: text)
        guard let match = numberRegex.firstMatch(in: text, options: [], range: searchRange) else {
            return nil
        }
        return Range(match.range, in: text)
    }

    /// Removes every whitespace character; the regex allows any `\s` around the slash.
    private func compactCardNumber(_ fragment: String) -> String {
        String(fragment.filter { !$0.isWhitespace })
    }

    /// Maps rarity wording in `rawText` to a `CardRarity` raw value.
    private func extractRarity(from rawText: String) -> String {
        let lowered = rawText.lowercased()

        // NOTE(review): this snapshot also tested `lowered.contains("")` ahead of the keyword
        // scan (for holo rare, uncommon and common) — presumably rarity glyphs that were lost.
        // An empty needle matches nothing or everything depending on which `contains` overload
        // is picked, so those checks are dropped until the real glyphs are restored.
        if lowered.contains("holo") {
            return CardRarity.holoRare.rawValue
        }
        if let keyword = rarityKeywords.first(where: { lowered.contains($0.needle) }) {
            return keyword.rarity.rawValue
        }
        return CardRarity.unknown.rawValue
    }

    /// Replaces non-breaking spaces with regular spaces and trims surrounding whitespace.
    private func normalize(_ value: String) -> String {
        // NOTE(review): the snapshot replaced " " with " "; presumably the needle was a
        // non-breaking space, spelled here as an explicit escape — confirm.
        value
            .replacingOccurrences(of: "\u{00A0}", with: " ")
            .trimmingCharacters(in: .whitespacesAndNewlines)
    }
}

View File

@@ -0,0 +1,19 @@
import Foundation
/// Input handed to a `CloudOCRClient`.
struct CloudOCRRequest {
/// JPEG-encoded bytes of the image to recognize.
let jpegData: Data
}
/// Text result returned by a `CloudOCRClient`.
struct CloudOCRTextResponse {
/// Recognized text as returned by the client (named markdown; the format is not checked here).
let markdown: String
}
/// Abstraction over a remote OCR backend.
protocol CloudOCRClient {
/// Recognizes text in the request's JPEG data; the result is optional and the call may throw.
func recognizeText(from request: CloudOCRRequest) async throws -> CloudOCRTextResponse?
}
/// Placeholder cloud client that never produces a response.
struct StubCloudOCRClient: CloudOCRClient {
    /// Ignores `request` and always returns `nil`.
    func recognizeText(from request: CloudOCRRequest) async throws -> CloudOCRTextResponse? {
        return nil
    }
}

View File

@@ -0,0 +1,14 @@
import Foundation
/// Post-processing step that may refine a recognition draft using the raw OCR text.
protocol CardFieldEnhancing {
/// Returns the draft to use in place of `draft`; `rawText` is the OCR text it was built from.
func enhance(draft: CardRecognitionDraft, rawText: String) async -> CardRecognitionDraft
}
/// Enhancer that performs no enhancement; it keeps the injection point in place.
struct NoOpCardFieldEnhancer: CardFieldEnhancing {
    /// Returns `draft` untouched; `rawText` is unused.
    func enhance(draft: CardRecognitionDraft, rawText: String) async -> CardRecognitionDraft {
        // TODO: Swap in a Foundation Models-backed enhancer once the runtime integration
        // is ready and verified locally. The injectable boundary keeps the call site
        // stable while the MVP compiles cleanly today.
        return draft
    }
}

View File

@@ -0,0 +1,99 @@
import CoreImage
import UIKit
/// Output of `ImagePreprocessor.prepare(_:)`: the same picture in the forms later stages need.
struct PreparedImage {
/// The processed image the other representations are derived from.
let normalizedImage: UIImage
/// Bitmap form of `normalizedImage`.
let analysisCGImage: CGImage
/// JPEG encoding of `normalizedImage`; empty when encoding failed.
let uploadJPEGData: Data
/// JPEG encoding of a downscaled copy, or `nil` when encoding failed.
let thumbnailJPEGData: Data?
}
/// Failures raised while preparing an image for OCR.
enum ImagePreprocessorError: LocalizedError {
    /// No bitmap could be produced from the processed image.
    case unableToCreateImage

    /// Human-readable explanation for the failure.
    var errorDescription: String? {
        switch self {
        case .unableToCreateImage:
            let message = "The selected image could not be prepared for OCR."
            return message
        }
    }
}
/// Prepares a photo for OCR: fixes orientation, caps the pixel size, applies a mild
/// contrast/sharpen pass and produces upload and thumbnail JPEG encodings.
struct ImagePreprocessor {
    private let ciContext = CIContext()

    /// Produces all representations of `image` needed by the recognition pipeline.
    ///
    /// - Parameter image: The captured or picked photo.
    /// - Returns: The processed image, its bitmap, an upload JPEG (empty if encoding fails)
    ///   and a thumbnail JPEG (`nil` if encoding fails).
    /// - Throws: `ImagePreprocessorError.unableToCreateImage` when no bitmap can be produced.
    func prepare(_ image: UIImage) throws -> PreparedImage {
        let upright = normalized(image)
        let resized = resizedImage(from: upright, maxDimension: 2_048)
        // Enhancement is optional; fall back to the resized image if Core Image fails.
        let enhanced = enhancedImage(from: resized) ?? resized

        guard let cgImage = makeCGImage(from: enhanced) else {
            throw ImagePreprocessorError.unableToCreateImage
        }

        return PreparedImage(
            normalizedImage: enhanced,
            analysisCGImage: cgImage,
            uploadJPEGData: enhanced.jpegData(compressionQuality: 0.82) ?? Data(),
            thumbnailJPEGData: resizedImage(from: enhanced, maxDimension: 240).jpegData(compressionQuality: 0.65)
        )
    }

    /// Redraws `image` so its orientation becomes `.up`, keeping its own scale and size.
    private func normalized(_ image: UIImage) -> UIImage {
        guard image.imageOrientation != .up else { return image }
        return redraw(image, size: image.size, scale: image.scale)
    }

    /// Downscales `image` so that its larger side is at most `maxDimension` pixels.
    /// Images already within the limit are returned unchanged.
    private func resizedImage(from image: UIImage, maxDimension: CGFloat) -> UIImage {
        let pixelWidth = image.size.width * image.scale
        let pixelHeight = image.size.height * image.scale
        let largestDimension = max(pixelWidth, pixelHeight)
        guard largestDimension > maxDimension else { return image }

        let ratio = maxDimension / largestDimension
        let targetSize = CGSize(width: pixelWidth * ratio, height: pixelHeight * ratio)
        return redraw(image, size: targetSize, scale: 1)
    }

    /// Draws `image` into a bitmap of `size` points at an explicit `scale`.
    ///
    /// The scale is always set explicitly: the renderer's default format uses the screen
    /// scale, which would multiply the pixel size of every output by that factor.
    private func redraw(_ image: UIImage, size: CGSize, scale: CGFloat) -> UIImage {
        // Skip rendering into an empty canvas.
        guard size.width > 0, size.height > 0 else { return image }

        let format = UIGraphicsImageRendererFormat()
        format.scale = scale
        let renderer = UIGraphicsImageRenderer(size: size, format: format)
        return renderer.image { _ in
            image.draw(in: CGRect(origin: .zero, size: size))
        }
    }

    /// Applies a slight contrast, saturation and brightness adjustment plus luminance
    /// sharpening. Returns `nil` when Core Image cannot read or render the image.
    private func enhancedImage(from image: UIImage) -> UIImage? {
        guard let ciImage = CIImage(image: image) else { return nil }

        let adjusted = ciImage
            .applyingFilter("CIColorControls", parameters: [
                kCIInputContrastKey: 1.08,
                kCIInputSaturationKey: 0.96,
                kCIInputBrightnessKey: 0.01,
            ])
            .applyingFilter("CISharpenLuminance", parameters: [
                kCIInputSharpnessKey: 0.35,
            ])

        guard let cgImage = ciContext.createCGImage(adjusted, from: adjusted.extent) else {
            return nil
        }
        return UIImage(cgImage: cgImage)
    }

    /// Returns a bitmap for `image`: its own `cgImage`, a render of its `ciImage`, or a redraw.
    private func makeCGImage(from image: UIImage) -> CGImage? {
        if let cgImage = image.cgImage {
            return cgImage
        }
        if let ciImage = image.ciImage {
            return ciContext.createCGImage(ciImage, from: ciImage.extent)
        }
        return redraw(image, size: image.size, scale: image.scale).cgImage
    }
}

View File

@@ -0,0 +1,37 @@
import Combine
import Foundation
import Network
/// Source of the current connectivity state that can be started and stopped.
protocol NetworkStatusProviding: AnyObject {
/// Whether the provider currently considers the device online.
var isOnline: Bool { get }
/// Begins observing connectivity changes.
func startMonitoring()
/// Stops observing connectivity changes.
func stopMonitoring()
}
/// Publishes connectivity derived from `NWPathMonitor` path updates.
@MainActor
final class NetworkMonitor: ObservableObject, NetworkStatusProviding {
    /// `true` while the latest reported path is satisfied; starts out as `true`.
    @Published private(set) var isOnline = true

    /// Monitor for the current monitoring run; `nil` while stopped.
    private var monitor: NWPathMonitor?
    private let queue = DispatchQueue(label: "StackDex.NetworkMonitor")

    /// Starts observing path changes. Does nothing if monitoring is already active.
    func startMonitoring() {
        guard monitor == nil else { return }

        // A cancelled NWPathMonitor cannot be started again, so every run gets a new one.
        let pathMonitor = NWPathMonitor()
        pathMonitor.pathUpdateHandler = { [weak self] path in
            let isSatisfied = path.status == .satisfied
            // The main queue is FIFO, so updates are applied in the order they arrive.
            DispatchQueue.main.async {
                self?.isOnline = isSatisfied
            }
        }
        pathMonitor.start(queue: queue)
        monitor = pathMonitor
    }

    /// Cancels the active monitor. Does nothing if monitoring is not active.
    func stopMonitoring() {
        guard let pathMonitor = monitor else { return }
        pathMonitor.cancel()
        monitor = nil
    }
}

View File

@@ -0,0 +1,120 @@
import Combine
import Photos
import PhotosUI
import SwiftUI
import UIKit
/// One entry of the recent-photos strip.
struct RecentPhotoItem: Identifiable, Equatable {
/// Stable identity of the entry.
let id: String
/// Photo library asset backing this entry.
let asset: PHAsset
/// Small preview image for the asset.
let thumbnail: UIImage
}
/// Tracks photo library authorization and exposes the most recent photos as thumbnails.
///
/// All members are isolated to the main actor.
@MainActor
final class PhotoLibraryService: NSObject, ObservableObject {
    /// Last observed read/write authorization for the photo library.
    @Published private(set) var authorizationStatus: PHAuthorizationStatus = PHPhotoLibrary.authorizationStatus(for: .readWrite)
    /// Thumbnails of the newest images, most recent first.
    @Published private(set) var recentPhotos: [RecentPhotoItem] = []
    /// `true` while `refreshRecentsIfPossible()` is building thumbnails.
    @Published private(set) var isLoading = false

    private let imageManager = PHCachingImageManager()

    /// Whether the current authorization allows reading any photos.
    var canBrowseRecents: Bool {
        authorizationStatus == .authorized || authorizationStatus == .limited
    }

    /// Short label describing the current authorization state.
    var statusMessage: String {
        switch authorizationStatus {
        case .authorized:
            return "Recent photos"
        case .limited:
            return "Limited photo access"
        case .denied:
            return "Photo access denied"
        case .restricted:
            return "Photo access restricted"
        case .notDetermined:
            return "Show recent photos"
        @unknown default:
            return "Photo access unavailable"
        }
    }

    /// Prompts for access when it is still undetermined, then reloads the recent photos.
    func requestAccessAndLoad() async {
        if authorizationStatus == .notDetermined {
            authorizationStatus = await PHPhotoLibrary.requestAuthorization(for: .readWrite)
        }
        await refreshRecentsIfPossible()
    }

    /// Re-reads the authorization status and loads up to 12 recent image thumbnails.
    /// Clears `recentPhotos` when browsing is not permitted.
    func refreshRecentsIfPossible() async {
        authorizationStatus = PHPhotoLibrary.authorizationStatus(for: .readWrite)
        guard canBrowseRecents else {
            recentPhotos = []
            return
        }

        isLoading = true
        defer { isLoading = false }

        let options = PHFetchOptions()
        options.sortDescriptors = [NSSortDescriptor(key: "creationDate", ascending: false)]
        options.fetchLimit = 12

        let assets = PHAsset.fetchAssets(with: .image, options: options)
        var results: [RecentPhotoItem] = []
        results.reserveCapacity(assets.count)
        for index in 0..<assets.count {
            // Assets without a thumbnail are skipped.
            if let item = thumbnailItem(for: assets.object(at: index)) {
                results.append(item)
            }
        }
        recentPhotos = results
    }

    /// Loads the full image data for `item`; returns `nil` if no decodable data is delivered.
    func loadImage(for item: RecentPhotoItem) async -> UIImage? {
        await withCheckedContinuation { continuation in
            let options = PHImageRequestOptions()
            options.deliveryMode = .highQualityFormat
            options.resizeMode = .fast
            options.isNetworkAccessAllowed = true

            imageManager.requestImageDataAndOrientation(for: item.asset, options: options) { data, _, _, _ in
                continuation.resume(returning: data.flatMap(UIImage.init(data:)))
            }
        }
    }

    /// Presents the system picker for changing the limited photo selection.
    func presentLimitedLibraryPicker() {
        // `connectedScenes` is an unordered set: prefer the foreground-active window scene
        // instead of whichever element happens to come first.
        let windowScenes = UIApplication.shared.connectedScenes.compactMap { $0 as? UIWindowScene }
        let activeScene = windowScenes.first(where: { $0.activationState == .foregroundActive }) ?? windowScenes.first
        guard var presenter = activeScene?.keyWindow?.rootViewController else {
            return
        }
        // Present from the top of the presentation chain; a controller that is already
        // presenting something cannot present the picker.
        while let presented = presenter.presentedViewController {
            presenter = presented
        }
        PHPhotoLibrary.shared().presentLimitedLibraryPicker(from: presenter)
    }

    /// Opens this app's page in the Settings app.
    func openSettings() {
        guard let url = URL(string: UIApplication.openSettingsURLString) else { return }
        UIApplication.shared.open(url)
    }

    /// Synchronously requests a 180×180 aspect-fill thumbnail for `asset`.
    private func thumbnailItem(for asset: PHAsset) -> RecentPhotoItem? {
        let targetSize = CGSize(width: 180, height: 180)
        let options = PHImageRequestOptions()
        options.deliveryMode = .opportunistic
        options.resizeMode = .fast
        // Synchronous delivery fills `thumbnailImage` before `requestImage` returns.
        // NOTE(review): this blocks the main actor once per asset; consider an async path.
        options.isSynchronous = true

        var thumbnailImage: UIImage?
        imageManager.requestImage(for: asset, targetSize: targetSize, contentMode: .aspectFill, options: options) { image, _ in
            thumbnailImage = image
        }

        guard let thumbnailImage else { return nil }
        return RecentPhotoItem(id: asset.localIdentifier, asset: asset, thumbnail: thumbnailImage)
    }
}
private extension UIWindowScene {
    /// The first window of this scene that reports itself as the key window, if any.
    var keyWindow: UIWindow? {
        return windows.first { $0.isKeyWindow }
    }
}

View File

@@ -0,0 +1,83 @@
import CoreGraphics
import Vision
/// One line of OCR output.
struct RecognizedTextLine: Equatable {
/// Recognized text of the line.
let text: String
/// Recognizer confidence for this line.
let confidence: Double
/// Bounding box of the line; the Vision recognizer stores the observation's `boundingBox` here.
let normalizedBounds: CGRect
}
/// Complete OCR result for one image.
struct OCRTextPayload: Equatable {
/// All recognized text as a single string.
let rawText: String
/// The individual recognized lines.
let lines: [RecognizedTextLine]
/// Average confidence across `lines`.
let averageConfidence: Double
}
/// Abstraction over an OCR engine that reads text from a prepared image.
protocol CardTextRecognizing {
/// Recognizes text in `image` and returns it as a payload; the call may throw.
func recognizeText(in image: PreparedImage) async throws -> OCRTextPayload
}
/// Failures specific to the Vision-based OCR service.
enum VisionCardOCRServiceError: LocalizedError {
    /// Recognition finished without producing any text line.
    case noRecognizedText

    /// Human-readable explanation for the failure.
    var errorDescription: String? {
        switch self {
        case .noRecognizedText:
            let message = "Vision OCR could not find readable card text."
            return message
        }
    }
}
/// On-device OCR backed by Vision's accurate text recognizer, tuned with card vocabulary.
struct VisionCardOCRService: CardTextRecognizing {
    /// Domain words passed to the recognizer as custom vocabulary.
    private let customWords = [
        "Pokémon", "Charizard", "Glurak", "Pikachu", "Blastoise", "Venusaur",
        "Illustration", "Scarlet", "Violet", "Trainer", "Holo", "Rare", "Ultra",
        "VMAX", "GX", "ex", "Base", "Set", "Promo", "Shiny"
    ]

    /// Recognizes text in the prepared image's analysis bitmap.
    ///
    /// - Parameter image: Prepared image whose `analysisCGImage` is analyzed.
    /// - Returns: The recognized lines, their newline-joined text and average confidence.
    /// - Throws: The error thrown by Vision when the request fails, or
    ///   `VisionCardOCRServiceError.noRecognizedText` when no line was recognized.
    func recognizeText(in image: PreparedImage) async throws -> OCRTextPayload {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = .accurate
        request.recognitionLanguages = ["en-US", "de-DE"]
        request.usesLanguageCorrection = true
        request.customWords = customWords

        // `perform` is synchronous, so the results are read directly afterwards. Bridging
        // through a completion handler plus a continuation risks resuming twice: once from
        // the handler's error and once from the error `perform` throws.
        let handler = VNImageRequestHandler(cgImage: image.analysisCGImage, options: [:])
        try handler.perform([request])

        let observations = request.results ?? []
        let lines = observations.compactMap { observation -> RecognizedTextLine? in
            // Keep only the recognizer's top candidate for each observation.
            guard let candidate = observation.topCandidates(1).first else { return nil }
            return RecognizedTextLine(
                text: candidate.string,
                confidence: Double(candidate.confidence),
                normalizedBounds: observation.boundingBox
            )
        }

        guard !lines.isEmpty else {
            throw VisionCardOCRServiceError.noRecognizedText
        }

        let average = lines.map(\.confidence).reduce(0, +) / Double(lines.count)
        return OCRTextPayload(
            rawText: lines.map(\.text).joined(separator: "\n"),
            lines: lines,
            averageConfidence: average
        )
    }
}