Skip to content

Corrections

Make corrections to your data.

corrections_from_dict(corrections_dict)

Create a list of Correction objects from a simpler dict-based config for corrections. Each key is the annotation text, and each value is either the label to convert to, a (from_label, to_label) tuple, or a (List[from_labels], to_label) tuple if you want to convert only a subset of labels at a time.

Parameters:

Name Type Description Default
corrections_dict Dict[str, Any]

Corrections formatted dict e.g. { "united states": "GPE", "London": (["LOC"], "GPE") }

required

Raises:

Type Description
ValueError

If the format of the dict is invalid, i.e. a value is neither a str label (or None) nor a tuple of (from_label(s), to_label)

Returns:

Type Description
List[Correction]
Source code in recon/corrections.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def corrections_from_dict(corrections_dict: Dict[str, Any]) -> List[Correction]:
    """Create a list of Correction objects from a simpler dict-based config.

    Each key is the annotation text to correct. Each value is one of:

    * a ``str`` label to convert to (applied whatever the current label is),
    * ``None`` as the target label (``fix_annotations`` treats a ``None``
      target label as a request to remove the annotation),
    * a ``(from_label, to_label)`` tuple, or
    * a ``(List[from_labels], to_label)`` tuple if you want to convert only a
      subset of labels at a time.

    Args:
        corrections_dict (Dict[str, Any]): Corrections formatted dict
            e.g. {
                "united states": "GPE",
                "London": (["LOC"], "GPE")
            }

    Raises:
        ValueError: If a value is neither a str, None, nor a tuple of exactly
            two items.

    Returns:
        List[Correction]: One Correction per dict entry, in dict order.
    """
    corrections: List[Correction] = []
    for key, val in corrections_dict.items():
        if isinstance(val, str) or val is None:
            # A bare target label applies regardless of the span's current label.
            from_labels = ["ANY"]
            to_label = val
        elif isinstance(val, tuple):
            # Reject malformed tuples up front: a short tuple would otherwise
            # surface as an IndexError and extra items would be silently ignored.
            if len(val) != 2:
                raise ValueError(
                    "Cannot parse corrections dict. Tuple values must have exactly"
                    " two items: (FROM_LABEL, TO_LABEL)"
                )
            source, to_label = val
            # A single source label is normalised to a one-element list.
            from_labels = [source] if isinstance(source, str) else source
        else:
            raise ValueError(
                "Cannot parse corrections dict. Value must be either a str of the"
                " label "
                + "to change the annotation to (TO_LABEL) or a tuple of (FROM_LABEL,"
                " TO_LABEL)"
            )
        corrections.append(
            Correction(annotation=key, from_labels=from_labels, to_label=to_label)
        )
    return corrections

fix_annotations(example, corrections, case_sensitive=False, dryrun=False)

Fix annotations in a copy of List[Example] data.

This function will NOT add annotations to your data. It will only remove erroneous annotations and fix the labels for specific spans.

Parameters:

Name Type Description Default
example Example

Input Example

required
corrections List[Correction]

List of Correction objects, each mapping an annotation's text to a new label. If a correction's to_label is set to None, the annotation will be removed

required
case_sensitive bool

Consider case of text for each correction

False
dryrun bool

Treat corrections as a dryrun and just print all changes to be made

False

Returns:

Name Type Description
Example Example

Example with fixed annotations

Source code in recon/corrections.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
@operation("recon.fix_annotations.v1")
def fix_annotations(
    example: Example,
    corrections: List[Correction],
    case_sensitive: bool = False,
    dryrun: bool = False,
) -> Example:
    """Fix the annotations of an Example using a list of corrections.

    This function will NOT add annotations to your data.
    It will only remove erroneous annotations and fix the
    labels for specific spans.

    A correction applies to a span when the span text equals the correction's
    annotation text and the span's current label is listed in the correction's
    ``from_labels`` (or ``from_labels`` contains "ANY").

    NOTE(review): this function body edits ``example.spans`` in place; if the
    data is copied first, that has to happen in the ``operation`` decorator,
    which is not visible here.

    Args:
        example (Example): Input Example
        corrections (List[Correction]): Corrections to apply. If a
            correction's to_label is None, the matching annotation
            will be removed. The Correction objects are not modified.
        case_sensitive (bool, optional): Consider case of text for each correction
        dryrun (bool, optional): Treat corrections as a dryrun
            and just print all changes to be made

    Returns:
        Example: Example with fixed annotations
    """

    # Key the lookup by the (optionally lowercased) annotation text without
    # writing the lowercased text back onto the caller's Correction objects.
    # When two corrections share a key, the later one wins.
    corrections_map: Dict[str, Correction] = {
        (c.annotation if case_sensitive else c.annotation.lower()): c
        for c in corrections
    }
    prints: List[str] = []

    ents_to_remove: List[int] = []
    for i, s in enumerate(example.spans):
        t = s.text if case_sensitive else s.text.lower()
        if t not in corrections_map:
            continue

        c = corrections_map[t]
        if s.label not in c.from_labels and "ANY" not in c.from_labels:
            continue

        if c.to_label is None:
            if dryrun:
                prints.append(f"Deleting span: {s.text}")
            else:
                ents_to_remove.append(i)
        elif dryrun:
            prints.append(
                f"Correction span: {s.text} from labels: {c.from_labels} to"
                f" label: {c.to_label}"
            )
        else:
            s.label = cast(str, c.to_label)

    # Delete from the highest index down so earlier indices stay valid.
    for idx in reversed(ents_to_remove):
        del example.spans[idx]

    if dryrun:
        msg.divider("Example Text")
        msg.text(example.text)
        for line in prints:
            msg.text(line)

    return example

rename_labels(example, label_map)

Rename labels in a copy of List[Example] data

Parameters:

Name Type Description Default
example Example

Input Example

required
label_map Dict[str, str]

One-to-one mapping of label names

required

Returns:

Name Type Description
Example Example

Copy of Example with renamed labels

Source code in recon/corrections.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
@operation("recon.rename_labels.v1")
def rename_labels(example: Example, label_map: Dict[str, str]) -> Example:
    """Rename the labels of an Example's spans using a label mapping.

    Args:
        example (Example): Input Example
        label_map (Dict[str, str]): One-to-one mapping from current label
            names to new label names. Labels without an entry are kept as is.

    Returns:
        Example: The Example that was passed in, with renamed span labels
    """
    for annotation in example.spans:
        current = annotation.label
        # Fall back to the current label when the map has no entry for it.
        annotation.label = label_map.get(current, current)
    return example

split_sentences(example, preprocessed_outputs={})

Split a single example into multiple examples by splitting the text into multiple sentences and resetting entity and token offsets based on offsets relative to sentence boundaries

Parameters:

Name Type Description Default
example Example

Input Example

required
preprocessed_outputs Dict[str, Any]

Outputs of preprocessors.

{}

Returns:

Type Description
List[Example]

List[Example]: List of split examples. Could be a list of 1 if the example is just one sentence.

Source code in recon/corrections.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
@operation("recon.split_sentences.v1", pre=["recon.spacy.v1"])
def split_sentences(
    example: Example, preprocessed_outputs: Dict[str, Any] = {}
) -> List[Example]:
    """Split a single example into multiple examples by splitting the text into
    multiple sentences and resetting entity and token offsets based on offsets
    relative to sentence boundaries

    Each span of the example is first mapped onto the preprocessed doc with
    ``doc.char_span``. If that yields nothing, a token whose text equals the
    span's text is used instead. Spans that still cannot be mapped are left
    out of the result rather than being passed to ``doc.set_ents`` as None.

    Args:
        example (Example): Input Example
        preprocessed_outputs (Dict[str, Any], optional): Outputs of preprocessors.
            Must contain the "recon.spacy.v1" output for this example
            (presumably the spaCy Doc — it is used via ``char_span``,
            ``set_ents`` and ``sents``).

    Raises:
        KeyError: If "recon.spacy.v1" is missing from preprocessed_outputs.

    Returns:
        List[Example]: List of split examples.
            Could be a list of 1 if the example is just one sentence.
    """
    doc = preprocessed_outputs["recon.spacy.v1"]

    new_examples = []
    ents = []
    for ent in example.spans:
        span = doc.char_span(ent.start, ent.end, label=ent.label)
        if not span:
            # The character offsets did not yield a span; fall back to a
            # token with exactly the same text.
            # NOTE(review): the LAST matching token in the doc is used,
            # wherever it occurs — confirm this is the intended choice.
            token = None
            text = doc.text[ent.start : ent.end]
            for t in doc:
                if t.text == text:
                    token = t
            if token:
                span = SpacySpan(doc, token.i, token.i + 1, label=ent.label)
        if span is not None:
            # Only keep entities that could be aligned; a None entry would
            # make doc.set_ents fail for the whole example.
            ents.append(span)
    doc.set_ents(ents)

    for sent in doc.sents:
        # A standalone doc per sentence makes entity and token offsets
        # relative to the start of the sentence.
        sent_doc = sent.as_doc()
        new_example = Example(
            text=sent.text,
            spans=[
                Span(
                    text=e.text,
                    start=e.start_char,
                    end=e.end_char,
                    token_start=e.start,
                    token_end=e.end,
                    label=e.label_,
                )
                for e in sent_doc.ents
            ],
            tokens=[
                Token(text=t.text, start=t.idx, end=t.idx + len(t.text), id=i)
                for i, t in enumerate(sent_doc)
            ],
        )
        new_examples.append(new_example)
    return new_examples

strip_annotations(example, *, strip_chars=['.', '!', '?', '-', ':', ' '], preprocessed_outputs={})

Strip punctuation and spaces from start and end of annotations. These characters are almost always a mistake and will confuse a model

Parameters:

Name Type Description Default
example Example

Input Example

required
strip_chars List[str]

Characters to strip.

['.', '!', '?', '-', ':', ' ']

Returns:

Name Type Description
Example Example

Example with stripped spans

Source code in recon/corrections.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
@operation("recon.strip_annotations.v1", pre=["recon.spacy.v1"], handles_tokens=False)
def strip_annotations(
    example: Example,
    *,
    strip_chars: List[str] = [".", "!", "?", "-", ":", " "],
    preprocessed_outputs: Dict[str, Any] = {},
) -> Example:
    """Strip punctuation and spaces from start and end of annotations.
    These characters are almost always a mistake and will confuse a model

    Both ends of every span are stripped, and ``start`` / ``end`` are moved
    by the number of characters removed on each side. A span whose text
    consists only of strip characters (or is empty) is left untouched, since
    stripping it would leave an empty annotation.

    Args:
        example (Example): Input Example. Its spans are updated in place.
        strip_chars (List[str], optional): Characters to strip. Each entry is
            compared against single characters of the span text.
        preprocessed_outputs (Dict[str, Any], optional): Outputs of
            preprocessors. Not needed by the stripping logic itself.

    Returns:
        Example: Example with stripped spans
    """
    strippable = frozenset(strip_chars)

    for s in example.spans:
        text = s.text

        # Count the strip characters at the front of the text.
        lead = 0
        while lead < len(text) and text[lead] in strippable:
            lead += 1
        if lead == len(text):
            # Nothing but strip characters (or empty text): leave it alone.
            continue

        # At least one character survives, so this scan stops before `lead`.
        stop = len(text)
        while text[stop - 1] in strippable:
            stop -= 1

        if lead or stop != len(text):
            s.text = text[lead:stop]
            s.start += lead
            s.end -= len(text) - stop
    return example