
Tokenization

add_tokens(example, *, use_spacy_token_ends=False, preprocessed_outputs)

Add tokens to each Example

Parameters:

Name                  Type            Description                                                Default
example               Example         Input Example                                              required
use_spacy_token_ends  bool            Use spaCy's exclusive token end index for span.token_end   False
preprocessed_outputs  Dict[str, Any]  Outputs of preprocessors                                   required

Returns:

Name     Type               Description
Example  Optional[Example]  Example with tokens, or None if any span does not align to token boundaries

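add_tokens reads the spaCy Doc produced by the recon.spacy.v1 preprocessor from preprocessed_outputs. A minimal, illustrative sketch of that mapping, assuming a blank English pipeline (the actual preprocessor may load a different spaCy model):

import spacy

nlp = spacy.blank("en")  # assumption: any spaCy pipeline that tokenizes the text
doc = nlp("Recon fixes annotated data")

# add_tokens expects the Doc under the preprocessor's registered key
preprocessed_outputs = {"recon.spacy.v1": doc}
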
Source code in recon/tokenization.py
@operation("recon.add_tokens.v1", pre=["recon.spacy.v1"])
def add_tokens(
    example: Example,
    *,
    use_spacy_token_ends: bool = False,
    preprocessed_outputs: Dict[str, Any],
) -> Optional[Example]:
    """Add tokens to each Example

    Args:
        example (Example): Input Example
        use_spacy_token_ends (bool): If True, set span.token_end to the
            exclusive spaCy token index (token.i + 1). Defaults to False.
        preprocessed_outputs (Dict[str, Any]): Outputs of preprocessors

    Returns:
        Optional[Example]: Example with tokens, or None if any span does not
            align to token boundaries
    """
    doc = preprocessed_outputs["recon.spacy.v1"]

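    # Build the token list and char-offset lookups (start/end -> spaCy token)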
    tokens = []
    token_starts = {}
    token_ends = {}

    for t in doc:
        start = t.idx
        end = t.idx + len(t)
        tokens.append(Token(text=t.text, start=start, end=end, id=t.i))
        token_starts[start] = t
        token_ends[end] = t

    example.tokens = tokens

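    # Align each span's character offsets to token indices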
    for span in example.spans:
        if span.start in token_starts and span.end in token_ends:
            span.token_start = token_starts[span.start].i
            if use_spacy_token_ends:
                span.token_end = token_ends[span.end].i + 1
            else:
                span.token_end = token_ends[span.end].i

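        # Bail out if any span does not align to token boundaries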
        if span.token_start is None or span.token_end is None:
            return None

    return example
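
The span alignment hinges on exact matches between span character offsets and token boundaries. A self-contained sketch of the same lookup logic using spaCy directly (the text and offsets are hypothetical, not part of recon):

import spacy

nlp = spacy.blank("en")
doc = nlp("Recon fixes annotated data")

# Character-offset lookups, mirroring token_starts / token_ends above
token_starts = {t.idx: t for t in doc}
token_ends = {t.idx + len(t): t for t in doc}

span_start, span_end = 0, 5  # character offsets of "Recon"
if span_start in token_starts and span_end in token_ends:
    token_start = token_starts[span_start].i     # 0, inclusive start index
    end_exclusive = token_ends[span_end].i + 1   # 1, what use_spacy_token_ends=True yields
    end_inclusive = token_ends[span_end].i       # 0, the default behavior

If a span's start or end falls inside a token, the lookups miss, span.token_start or span.token_end stays None, and add_tokens returns None so the misaligned example can be dropped downstream.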