Keyven committed on
Commit
a81a250
·
verified ·
1 Parent(s): fd206e2

Upload schemas/generic_document.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. schemas/generic_document.json +107 -0
schemas/generic_document.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "german-ocr-3/schemas/generic_document.json",
4
+ "title": "GermanOCR3 Generic Document",
5
+ "description": "Allgemeines deutsches Dokument-Extraktionsschema. Felder dürfen null sein, wenn nicht eindeutig erkennbar.",
6
+ "type": "object",
7
+ "additionalProperties": false,
8
+ "required": ["document_type", "language", "raw_text", "confidence"],
9
+ "properties": {
10
+ "document_type": {
11
+ "description": "z.B. invoice, receipt, letter, form, contract, id_card, other, unknown",
12
+ "type": ["string", "null"]
13
+ },
14
+ "language": {
15
+ "description": "BCP-47 Sprachcode des Dokuments (typisch 'de').",
16
+ "type": "string",
17
+ "default": "de"
18
+ },
19
+ "sender": {
20
+ "type": ["object", "null"],
21
+ "additionalProperties": false,
22
+ "properties": {
23
+ "name": {"type": ["string", "null"]},
24
+ "address": {"type": ["string", "null"]},
25
+ "email": {"type": ["string", "null"]},
26
+ "phone": {"type": ["string", "null"]},
27
+ "tax_id": {"type": ["string", "null"]},
28
+ "vat_id": {"type": ["string", "null"]}
29
+ }
30
+ },
31
+ "recipient": {
32
+ "type": ["object", "null"],
33
+ "additionalProperties": false,
34
+ "properties": {
35
+ "name": {"type": ["string", "null"]},
36
+ "address": {"type": ["string", "null"]},
37
+ "customer_id": {"type": ["string", "null"]}
38
+ }
39
+ },
40
+ "date": {
41
+ "description": "Hauptdatum des Dokuments im Format YYYY-MM-DD, falls eindeutig erkennbar.",
42
+ "type": ["string", "null"]
43
+ },
44
+ "reference_numbers": {
45
+ "type": "array",
46
+ "items": {
47
+ "type": "object",
48
+ "additionalProperties": false,
49
+ "required": ["label", "value"],
50
+ "properties": {
51
+ "label": {"type": "string"},
52
+ "value": {"type": "string"}
53
+ }
54
+ },
55
+ "default": []
56
+ },
57
+ "amounts": {
58
+ "type": "array",
59
+ "items": {
60
+ "type": "object",
61
+ "additionalProperties": false,
62
+ "required": ["label", "value"],
63
+ "properties": {
64
+ "label": {"type": "string"},
65
+ "value": {"type": ["number", "string"]},
66
+ "currency": {"type": ["string", "null"]}
67
+ }
68
+ },
69
+ "default": []
70
+ },
71
+ "tables": {
72
+ "type": "array",
73
+ "default": [],
74
+ "items": {
75
+ "type": "object",
76
+ "additionalProperties": false,
77
+ "required": ["headers", "rows"],
78
+ "properties": {
79
+ "title": {"type": ["string", "null"]},
80
+ "headers": {"type": "array", "items": {"type": "string"}},
81
+ "rows": {
82
+ "type": "array",
83
+ "items": {
84
+ "type": "array",
85
+ "items": {"type": ["string", "number", "null"]}
86
+ }
87
+ }
88
+ }
89
+ }
90
+ },
91
+ "raw_text": {
92
+ "description": "Roher OCR-Text, möglichst layouterhaltend, deutsche Originalschreibweise behalten.",
93
+ "type": ["string", "null"]
94
+ },
95
+ "confidence": {
96
+ "description": "Subjektive Selbsteinschätzung 0..1.",
97
+ "type": ["number", "null"],
98
+ "minimum": 0,
99
+ "maximum": 1
100
+ },
101
+ "notes": {
102
+ "type": "array",
103
+ "items": {"type": "string"},
104
+ "default": []
105
+ }
106
+ }
107
+ }