Skip to content

Corpus

Corpus

Container for a full Corpus with train/dev/test splits. Used to apply core functions to all datasets at once.

Source code in recon/corpus.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
class Corpus:
    """Container for a full Corpus with train/dev/test splits.
    Used to apply core functions to all datasets at once.

    A Corpus always ends up holding three Dataset objects: if no test
    Dataset is supplied, an empty placeholder named "test" is created in
    ``__init__`` so the rest of the API can treat the splits uniformly.
    All three splits share a single ExampleStore.
    """

    def __init__(
        self,
        name: str,
        train: Dataset,
        dev: Dataset,
        test: Optional[Dataset] = None,
        example_store: Optional[ExampleStore] = None,
    ):
        """Initialize a Corpus.

        Args:
            name (str): Name of the Corpus
            train (Dataset): Dataset containing examples for **train** set
            dev (Dataset): Dataset containing examples for **dev** set
            test (Optional[Dataset]): Optional Dataset containing examples
                for **test** set
            example_store (Optional[ExampleStore]): Optional ExampleStore.
                When omitted, a store is built from the examples of all
                provided datasets.
        """
        self._name = name
        # Build a shared ExampleStore from the caller's examples when none
        # was passed. This runs *before* the empty test placeholder is
        # created below, so only real examples seed the store.
        if example_store is None:
            examples = train.data + dev.data
            if test:
                examples += test.data
            example_store = ExampleStore(examples)
        self._example_store = example_store

        # Substitute an empty placeholder so _test is always a Dataset,
        # never None (downstream methods call methods on it directly).
        if test is None:
            test = Dataset("test")

        # Wire every split to the same store so example lookups stay
        # consistent across datasets.
        for ds in (train, dev, test):
            ds.set_example_store(example_store)

        self._train = train
        self._dev = dev
        self._test = test

    @property
    def name(self) -> str:
        """Get Corpus name

        Returns:
            str: Corpus name
        """
        return self._name

    @property
    def train_ds(self) -> Dataset:
        """Return train dataset

        Returns:
            Dataset: Train Dataset
        """
        return self._train

    @property
    def dev_ds(self) -> Dataset:
        """Return dev dataset

        Returns:
            Dataset: Dev Dataset
        """
        return self._dev

    @property
    def test_ds(self) -> Dataset:
        """Return test dataset (possibly the empty placeholder)

        Returns:
            Dataset: Test Dataset
        """
        return self._test

    @property
    def train(self) -> List[Example]:
        """Return train dataset

        Returns:
            List[Example]: Train Examples
        """
        return self._train.data

    @property
    def dev(self) -> List[Example]:
        """Return dev dataset

        Returns:
            List[Example]: Dev Examples
        """
        return self._dev.data

    @property
    def test(self) -> List[Example]:
        """Return test dataset

        Returns:
            List[Example]: Test Examples
        """
        # Normalizes a falsy .data (e.g. the empty placeholder split) to [].
        # NOTE(review): train/dev do not apply the same normalization —
        # presumably their .data is always a list; confirm in Dataset.
        return self._test.data or []

    @property
    def all(self) -> List[Example]:
        """Return concatenation of train/dev/test datasets

        Returns:
            List[Example]: All Examples in Corpus
        """
        return self.train + self.dev + self.test

    @property
    def example_store(self) -> ExampleStore:
        """Return the ExampleStore shared by all three splits."""
        return self._example_store

    def summary(self) -> str:
        """Return a newline-joined summary of each (non-empty) split."""
        summaries = [self.train_ds.summary(), self.dev_ds.summary()]
        # NOTE(review): _test is never None after __init__, so this
        # truthiness check relies on Dataset.__bool__/__len__ being falsy
        # for an empty dataset — confirm against Dataset's implementation.
        if self.test_ds:
            summaries.append(self.test_ds.summary())
        return "\n".join(summaries)

    def print_summary(self) -> None:
        """Print the summary of all splits to stdout."""
        print(self.summary())

    def __str__(self) -> str:
        return self.summary()

    def apply(
        self, func: Union[str, StatsProtocol], *args: Any, **kwargs: Any
    ) -> CorpusApplyResult:
        """Apply a function to all datasets

        Args:
            func (Union[str, StatsProtocol]):
                Function that operates on a list of examples and returns
                some result. Useful for running the same stats operation
                for each dataset. If a str is provided, a function is resolved
                from the stat functions registry

        Returns:
            CorpusApplyResult: CorpusApplyResult mapping dataset
                name to return type of func Callable
        """

        # NOTE(review): despite the docstring, a str func is NOT resolved
        # here — it is called directly (hence the type: ignore markers).
        # Presumably resolution happens upstream or the docstring is stale;
        # a plain string argument would raise TypeError. Confirm.
        return CorpusApplyResult(
            train=func(self.train, *args, **kwargs),  # type: ignore
            dev=func(self.dev, *args, **kwargs),  # type: ignore
            test=func(self.test, *args, **kwargs),  # type: ignore
            all=func(self.all, *args, **kwargs),  # type: ignore
        )

    def apply_(
        self, operation: Union[str, Operation], *args: Any, **kwargs: Any
    ) -> None:
        """Apply an operation to each Dataset via `Dataset.apply_`

        Args:
            operation (Union[str, Operation]): An Operation to modify the
                Dataset with.
            Defers to `Dataset.apply_`
        """
        # In-place: mutates each split; no value is returned.
        self._train.apply_(operation, *args, **kwargs)
        self._dev.apply_(operation, *args, **kwargs)
        self._test.apply_(operation, *args, **kwargs)

    def pipe_(self, operations: List[Union[str, Operation]]) -> None:
        """Run a sequence of operations on each dataset.
        Calls Dataset.pipe_ for each dataset

        Args:
            operations (List[Union[str, OperationState]]): List of operations
        """
        self._train.pipe_(operations)
        self._dev.pipe_(operations)
        self._test.pipe_(operations)

    @classmethod
    def from_disk(
        cls,
        data_dir: Union[str, Path],
        name: str = "corpus",
        train_name: str = "train",
        dev_name: str = "dev",
        test_name: str = "test",
    ) -> "Corpus":
        """Load Corpus from disk given a directory with files
        named explicitly train.jsonl, dev.jsonl, and test.jsonl

        Args:
            data_dir (Path): directory to load from.
            name (str, optional): Fallback Corpus name; overridden by the
                name stored in .recon/meta.json when that file exists.
            train_name (str, optional): Name of train data under data_dir.
                Defaults to train.
            dev_name (str, optional): Name of dev data under data_dir.
                Defaults to dev.
            test_name (str, optional): Name of test data under data_dir.
                Defaults to test.
        """
        data_dir = ensure_path(data_dir)

        # A persisted corpus carries its metadata under .recon/; the saved
        # name takes precedence over the `name` argument.
        corpus_meta_path = data_dir / ".recon" / "meta.json"
        if corpus_meta_path.exists():
            corpus_meta = CorpusMeta.model_validate(srsly.read_json(corpus_meta_path))
            name = corpus_meta.name

        # Pre-load the shared example store (if previously saved) so the
        # datasets below can resolve examples from it.
        example_store_path = data_dir / ".recon" / "example_store.jsonl"
        example_store = ExampleStore()
        if example_store_path.exists():
            example_store.from_disk(example_store_path)

        train = Dataset(train_name, example_store=example_store).from_disk(data_dir)
        dev = Dataset(dev_name, example_store=example_store).from_disk(data_dir)

        # The test split is optional on disk: a missing test file surfaces
        # as ValueError (presumably from Dataset.from_disk — confirm), in
        # which case the Corpus is built without one.
        try:
            test = Dataset(test_name, example_store=example_store).from_disk(data_dir)
            corpus = cls(name, train, dev, test=test)
        except ValueError:
            corpus = cls(name, train, dev)
        return corpus

    def to_disk(self, output_dir: Union[str, Path], overwrite: bool = False) -> None:
        """Save Corpus to Disk

        Args:
            output_dir (Path): Directory to save data to
            overwrite (bool): Force save to directory. Create parent directories
                and/or overwrite existing data.

        Raises:
            ValueError: If output_dir already exists and overwrite is False.
        """
        data_dir = ensure_path(output_dir)
        state_dir = data_dir / ".recon"
        corpus_meta_path = state_dir / "meta.json"

        # NOTE(review): the guard checks *existence*, not emptiness, and
        # nothing below actually clears the directory — the message
        # overstates both. Existing files are overwritten in place.
        if not overwrite and data_dir.exists():
            raise ValueError(
                "Output directory is not empty. Set overwrite=True in Corpus.to_disk to"
                " clear the directory before saving."
            )

        data_dir.mkdir(parents=True, exist_ok=True)
        # exist_ok=True already makes this guard redundant; kept as-is.
        if not state_dir.exists():
            state_dir.mkdir(parents=True, exist_ok=True)

        srsly.write_json(corpus_meta_path, CorpusMeta(name=self.name).model_dump())
        # Splits skip per-dataset example saving; the shared store is
        # persisted once below for the whole corpus.
        self._train.to_disk(data_dir, overwrite=overwrite, save_examples=False)
        self._dev.to_disk(data_dir, overwrite=overwrite, save_examples=False)
        if self._test:
            self._test.to_disk(data_dir, overwrite=overwrite, save_examples=False)
        self.example_store.to_disk(state_dir / "example_store.jsonl")

    @classmethod
    def from_prodigy(
        cls,
        name: str,
        prodigy_train_datasets: List[str],
        prodigy_dev_datasets: List[str],
        prodigy_test_datasets: Optional[List[str]] = None,
    ) -> "Corpus":
        """Load a Corpus from 3 separate datasets in Prodigy

        Args:
            name: Corpus name
            prodigy_train_datasets (List[str]): Prodigy datasets to load
                as Recon train dataset
            prodigy_dev_datasets (List[str]): Prodigy datasets to load as
                Recon dev dataset
            prodigy_test_datasets (Optional[List[str]]): Prodigy datasets to load as
                Recon test dataset

        Returns:
            Corpus: Corpus initialized from prodigy datasets
        """
        train_ds = Dataset("train").from_prodigy(prodigy_train_datasets)
        dev_ds = Dataset("dev").from_prodigy(prodigy_dev_datasets)
        # test is optional; __init__ substitutes an empty placeholder when None.
        test_ds = (
            Dataset("test").from_prodigy(prodigy_test_datasets)
            if prodigy_test_datasets
            else None
        )

        ds = cls(name, train_ds, dev_ds, test_ds)
        return ds

    def to_prodigy(
        self,
        name: Optional[str] = None,
        prodigy_train_dataset: Optional[str] = None,
        prodigy_dev_dataset: Optional[str] = None,
        prodigy_test_dataset: Optional[str] = None,
        overwrite: bool = True,
    ) -> Tuple[str, str, str]:
        """Save a Corpus to 3 separate Prodigy datasets

        Args:
            name (Optional[str]): Name prefix for datasets in Prodigy;
                defaults to this Corpus's name.
            prodigy_train_dataset (Optional[str]): Train dataset name in Prodigy
            prodigy_dev_dataset (Optional[str]): Dev dataset name in Prodigy
            prodigy_test_dataset (Optional[str]): Test dataset name in Prodigy
            overwrite (bool): Passed through to each Dataset.to_prodigy call.

        Returns:
            Tuple[str, str, str]: The (train, dev, test) Prodigy dataset names used.
        """
        name = name if name else self.name

        # Default names embed each split's commit hash so distinct corpus
        # states land in distinct Prodigy datasets.
        if not prodigy_train_dataset:
            prodigy_train_dataset = f"{name}_train_{self.train_ds.commit_hash}"

        if not prodigy_dev_dataset:
            prodigy_dev_dataset = f"{name}_dev_{self.dev_ds.commit_hash}"

        if not prodigy_test_dataset:
            prodigy_test_dataset = f"{name}_test_{self.test_ds.commit_hash}"

        self.train_ds.to_prodigy(prodigy_train_dataset, overwrite=overwrite)
        self.dev_ds.to_prodigy(prodigy_dev_dataset, overwrite=overwrite)
        self.test_ds.to_prodigy(prodigy_test_dataset, overwrite=overwrite)

        return (prodigy_train_dataset, prodigy_dev_dataset, prodigy_test_dataset)

all: List[Example] property

Return concatenation of train/dev/test datasets

Returns:

Type Description
List[Example]

List[Example]: All Examples in Corpus

dev: List[Example] property

Return dev dataset

Returns:

Type Description
List[Example]

List[Example]: Dev Examples

dev_ds: Dataset property

Return dev dataset

Returns:

Name Type Description
Dataset Dataset

Dev Dataset

name: str property

Get Corpus name

Returns:

Name Type Description
str str

Corpus name

test: List[Example] property

Return test dataset

Returns:

Type Description
List[Example]

List[Example]: Test Examples

test_ds: Dataset property

Return test dataset

Returns:

Name Type Description
Dataset Dataset

Test Dataset

train: List[Example] property

Return train dataset

Returns:

Type Description
List[Example]

List[Example]: Train Examples

train_ds: Dataset property

Return train dataset

Returns:

Name Type Description
Dataset Dataset

Train Dataset

__init__(name, train, dev, test=None, example_store=None)

Initialize a Corpus.

Parameters:

Name Type Description Default
name str

Name of the Corpus

required
train Dataset

Dataset containing examples for train set

required
dev Dataset

Dataset containing examples for dev set

required
test Optional[Dataset]

Optional Dataset containing examples for test set

None
example_store Optional[ExampleStore]

Optional ExampleStore

None
Source code in recon/corpus.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __init__(
    self,
    name: str,
    train: Dataset,
    dev: Dataset,
    test: Optional[Dataset] = None,
    example_store: Optional[ExampleStore] = None,
):
    """Initialize a Corpus.

    Args:
        name (str): Name of the Corpus
        train (Dataset): Dataset containing examples for **train** set
        dev (Dataset): Dataset containing examples for **dev** set
        test (Optional[Dataset]): Optional Dataset containing examples
            for **test** set
        example_store (Optional[ExampleStore]): Optional ExampleStore
    """
    self._name = name
    if example_store is None:
        examples = train.data + dev.data
        if test:
            examples += test.data
        example_store = ExampleStore(examples)
    self._example_store = example_store

    if test is None:
        test = Dataset("test")

    for ds in (train, dev, test):
        ds.set_example_store(example_store)

    self._train = train
    self._dev = dev
    self._test = test

apply(func, *args, **kwargs)

Apply a function to all datasets

Parameters:

Name Type Description Default
func Union[str, StatsProtocol]

Function that operates on a list of examples and returns some result. Useful for running the same stats operation for each dataset. If a str is provided, a function is resolved from the stat functions registry

required

Returns:

Name Type Description
CorpusApplyResult CorpusApplyResult

CorpusApplyResult mapping dataset name to return type of func Callable

Source code in recon/corpus.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def apply(
    self, func: Union[str, StatsProtocol], *args: Any, **kwargs: Any
) -> CorpusApplyResult:
    """Apply a function to all datasets

    Args:
        func (Union[str, StatsProtocol]):
            Function that operates on a list of examples and returns
            some result. Useful for running the same stats operation
            for each dataset. If a str is provided, a function is resolved
            from the stat functions registry

    Returns:
        CorpusApplyResult: CorpusApplyResult mapping dataset
            name to return type of func Callable
    """

    return CorpusApplyResult(
        train=func(self.train, *args, **kwargs),  # type: ignore
        dev=func(self.dev, *args, **kwargs),  # type: ignore
        test=func(self.test, *args, **kwargs),  # type: ignore
        all=func(self.all, *args, **kwargs),  # type: ignore
    )

apply_(operation, *args, **kwargs)

Apply an operation to each Dataset via Dataset.apply_

Parameters:

Name Type Description Default
operation Union[str, Operation]

An Operation to modify the Dataset with.

required
Source code in recon/corpus.py
166
167
168
169
170
171
172
173
174
175
176
177
178
def apply_(
    self, operation: Union[str, Operation], *args: Any, **kwargs: Any
) -> None:
    """Apply an operation to each Dataset via `Dataset.apply_`

    Args:
        operation (Union[str, Operation]): An Operation to modify the
            Dataset with.
        Defers to `Dataset.apply_`
    """
    self._train.apply_(operation, *args, **kwargs)
    self._dev.apply_(operation, *args, **kwargs)
    self._test.apply_(operation, *args, **kwargs)

from_disk(data_dir, name='corpus', train_name='train', dev_name='dev', test_name='test') classmethod

Load Corpus from disk given a directory with files named explicitly train.jsonl, dev.jsonl, and test.jsonl

Parameters:

Name Type Description Default
data_dir Path

directory to load from.

required
train_name str

Name of train data under data_dir. Defaults to train.

'train'
dev_name str

Name of dev data under data_dir. Defaults to dev.

'dev'
test_name str

Name of test data under data_dir. Defaults to test.

'test'
Source code in recon/corpus.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
@classmethod
def from_disk(
    cls,
    data_dir: Union[str, Path],
    name: str = "corpus",
    train_name: str = "train",
    dev_name: str = "dev",
    test_name: str = "test",
) -> "Corpus":
    """Load Corpus from disk given a directory with files
    named explicitly train.jsonl, dev.jsonl, and test.jsonl

    Args:
        data_dir (Path): directory to load from.
        train_name (str, optional): Name of train data under data_dir.
            Defaults to train.
        dev_name (str, optional): Name of dev data under data_dir.
            Defaults to dev.
        test_name (str, optional): Name of test data under data_dir.
            Defaults to test.
    """
    data_dir = ensure_path(data_dir)

    corpus_meta_path = data_dir / ".recon" / "meta.json"
    if corpus_meta_path.exists():
        corpus_meta = CorpusMeta.model_validate(srsly.read_json(corpus_meta_path))
        name = corpus_meta.name

    example_store_path = data_dir / ".recon" / "example_store.jsonl"
    example_store = ExampleStore()
    if example_store_path.exists():
        example_store.from_disk(example_store_path)

    train = Dataset(train_name, example_store=example_store).from_disk(data_dir)
    dev = Dataset(dev_name, example_store=example_store).from_disk(data_dir)

    try:
        test = Dataset(test_name, example_store=example_store).from_disk(data_dir)
        corpus = cls(name, train, dev, test=test)
    except ValueError:
        corpus = cls(name, train, dev)
    return corpus

from_prodigy(name, prodigy_train_datasets, prodigy_dev_datasets, prodigy_test_datasets=None) classmethod

Load a Corpus from 3 separate datasets in Prodigy

Parameters:

Name Type Description Default
name str

Corpus name

required
prodigy_train_datasets List[str]

Prodigy datasets to load as Recon train dataset

required
prodigy_dev_datasets List[str]

Prodigy datasets to load as Recon dev dataset

required
prodigy_test_datasets Optional[List[str]]

Prodigy datasets to load as Recon test dataset

None

Returns:

Name Type Description
Corpus Corpus

Corpus initialized from prodigy datasets

Source code in recon/corpus.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
@classmethod
def from_prodigy(
    cls,
    name: str,
    prodigy_train_datasets: List[str],
    prodigy_dev_datasets: List[str],
    prodigy_test_datasets: Optional[List[str]] = None,
) -> "Corpus":
    """Load a Corpus from 3 separate datasets in Prodigy

    Args:
        name: Corpus name
        prodigy_train_datasets (List[str]): Prodigy datasets to load
            as Recon train dataset
        prodigy_dev_datasets (List[str]): Prodigy datasets to load as
            Recon dev dataset
        prodigy_test_datasets (Optional[List[str]]): Prodigy datasets to load as
            Recon test dataset

    Returns:
        Corpus: Corpus initialized from prodigy datasets
    """
    train_ds = Dataset("train").from_prodigy(prodigy_train_datasets)
    dev_ds = Dataset("dev").from_prodigy(prodigy_dev_datasets)
    test_ds = (
        Dataset("test").from_prodigy(prodigy_test_datasets)
        if prodigy_test_datasets
        else None
    )

    ds = cls(name, train_ds, dev_ds, test_ds)
    return ds

pipe_(operations)

Run a sequence of operations on each dataset. Calls Dataset.pipe_ for each dataset

Parameters:

Name Type Description Default
operations List[Union[str, OperationState]]

List of operations

required
Source code in recon/corpus.py
180
181
182
183
184
185
186
187
188
189
def pipe_(self, operations: List[Union[str, Operation]]) -> None:
    """Run a sequence of operations on each dataset.
    Calls Dataset.pipe_ for each dataset

    Args:
        operations (List[Union[str, OperationState]]): List of operations
    """
    self._train.pipe_(operations)
    self._dev.pipe_(operations)
    self._test.pipe_(operations)

to_disk(output_dir, overwrite=False)

Save Corpus to Disk

Parameters:

Name Type Description Default
output_dir Path

Directory to save data to

required
overwrite bool

Force save to directory. Create parent directories and/or overwrite existing data.

False
Source code in recon/corpus.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def to_disk(self, output_dir: Union[str, Path], overwrite: bool = False) -> None:
    """Save Corpus to Disk

    Args:
        output_dir (Path): Directory to save data to
        overwrite (bool): Force save to directory. Create parent directories
            and/or overwrite existing data.
    """
    data_dir = ensure_path(output_dir)
    state_dir = data_dir / ".recon"
    corpus_meta_path = state_dir / "meta.json"

    if not overwrite and data_dir.exists():
        raise ValueError(
            "Output directory is not empty. Set overwrite=True in Corpus.to_disk to"
            " clear the directory before saving."
        )

    data_dir.mkdir(parents=True, exist_ok=True)
    if not state_dir.exists():
        state_dir.mkdir(parents=True, exist_ok=True)

    srsly.write_json(corpus_meta_path, CorpusMeta(name=self.name).model_dump())
    self._train.to_disk(data_dir, overwrite=overwrite, save_examples=False)
    self._dev.to_disk(data_dir, overwrite=overwrite, save_examples=False)
    if self._test:
        self._test.to_disk(data_dir, overwrite=overwrite, save_examples=False)
    self.example_store.to_disk(state_dir / "example_store.jsonl")

to_prodigy(name=None, prodigy_train_dataset=None, prodigy_dev_dataset=None, prodigy_test_dataset=None, overwrite=True)

Save a Corpus to 3 separate Prodigy datasets

Parameters:

Name Type Description Default
name Optional[str]

Name prefix for datasets in Prodigy

None
prodigy_train_dataset Optional[str]

Train dataset name in Prodigy

None
prodigy_dev_dataset Optional[str]

Dev dataset name in Prodigy

None
prodigy_test_dataset Optional[str]

Test dataset name in Prodigy

None
Source code in recon/corpus.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
def to_prodigy(
    self,
    name: Optional[str] = None,
    prodigy_train_dataset: Optional[str] = None,
    prodigy_dev_dataset: Optional[str] = None,
    prodigy_test_dataset: Optional[str] = None,
    overwrite: bool = True,
) -> Tuple[str, str, str]:
    """Save a Corpus to 3 separate Prodigy datasets

    Args:
        name (Optional[str]): Name prefix for datasets in Prodigy
        prodigy_train_dataset (Optional[str]): Train dataset name in Prodigy
        prodigy_dev_dataset (Optional[str]): Dev dataset name in Prodigy
        prodigy_test_dataset (Optional[str]): Test dataset name in Prodigy
    """
    name = name if name else self.name

    if not prodigy_train_dataset:
        prodigy_train_dataset = f"{name}_train_{self.train_ds.commit_hash}"

    if not prodigy_dev_dataset:
        prodigy_dev_dataset = f"{name}_dev_{self.dev_ds.commit_hash}"

    if not prodigy_test_dataset:
        prodigy_test_dataset = f"{name}_test_{self.test_ds.commit_hash}"

    self.train_ds.to_prodigy(prodigy_train_dataset, overwrite=overwrite)
    self.dev_ds.to_prodigy(prodigy_dev_dataset, overwrite=overwrite)
    self.test_ds.to_prodigy(prodigy_test_dataset, overwrite=overwrite)

    return (prodigy_train_dataset, prodigy_dev_dataset, prodigy_test_dataset)