diff --git a/docs/index.md b/docs/index.md index 5e678dba..2d5b619c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -58,6 +58,7 @@ - [FileMoverTask] - [FileReaderTask](reference/tasks/file_reader_task.md) - [FileRemoverTask] + - [FileSplitterTask](reference/tasks/file_splitter_task.md) - [FileWriterTask] - [FolderBrowserTask](reference/tasks/folder_browser_task.md) - [InputFileReaderTask](reference/tasks/input_file_reader_task.md) diff --git a/docs/reference/tasks/file_splitter_task.md b/docs/reference/tasks/file_splitter_task.md new file mode 100644 index 00000000..a652eb4b --- /dev/null +++ b/docs/reference/tasks/file_splitter_task.md @@ -0,0 +1,42 @@ +FileSplitterTask +================ + +Split a long file into smaller ones. + +Task reference +-------------- + +* **Service**: `CleverAge\ProcessBundle\Task\File\FileSplitterTask` +* **Iterable task** + +Accepted inputs +--------------- + +`array`: inputs are merged with the task-defined options (input values take precedence). + +Possible outputs +---------------- + +`string`: absolute path of the produced file + +Options +------- + +| Code | Type | Required | Default | Description | +|-------------------------|-----------------|:--------:|----------|------------------------------------------| +| `file_path` | `string` | **X** | | Path of the file to read from (absolute) | +| `max_lines` | `int` | | 1000 | Max number of lines in a produced file | + +Example +------- + +```yaml +# Task configuration level +entry: + service: '@CleverAge\ProcessBundle\Task\File\FileSplitterTask' + options: + file_path: '%kernel.project_dir%/var/data/json_stream_reader.json' + max_lines: 1 +``` + + diff --git a/src/Filesystem/SplFile.php b/src/Filesystem/SplFile.php new file mode 100644 index 00000000..14f38c40 --- /dev/null +++ b/src/Filesystem/SplFile.php @@ -0,0 +1,106 @@ +file = new \SplFileObject($filename, $mode); + + // Useful to skip empty trailing lines (doesn't work well on PHP 8, see readLine() code) + $this->file->setFlags(null !== $splFileObjectFlags + ? 
array_sum($splFileObjectFlags) + : \SplFileObject::DROP_NEW_LINE | \SplFileObject::READ_AHEAD | \SplFileObject::SKIP_EMPTY + ); + } + + /** + * Count the lines of the file; the result is cached after the first call. + * + * Warning! When the count is not cached yet, this method rewinds the file to the beginning before and after counting the lines! + */ + public function getLineCount(): int + { + if (null === $this->lineCount) { + $this->rewind(); + $line = 0; + while (!$this->isEndOfFile()) { + ++$line; + $this->file->next(); + } + $this->rewind(); + + $this->lineCount = $line; + } + + return $this->lineCount; + } + + /** + * Return the line number tracked by readLine(), writeLine() and rewind(). + */ + public function getLineNumber(): int + { + return $this->lineNumber; + } + + /** + * Tell whether the underlying file object reports end of file. + */ + public function isEndOfFile(): bool + { + return $this->file->eof(); + } + + /** + * Read the next line with fgets() and increment the line number. + * + * Returns null at end of file or when fgets() yields an empty string; the $length argument is not used. + */ + public function readLine(?int $length = null): ?string + { + if ($this->isEndOfFile()) { + return null; + } + + $rawLine = $this->file->fgets(); + // Fix issue on PHP 8 with empty line at the end, even if SKIP_EMPTY is set + if ('' === $rawLine) { + return null; + } + ++$this->lineNumber; + + return $rawLine; + } + + /** + * Write $data followed by PHP_EOL, increment the line number and return its new value. + */ + public function writeLine(string $data): int + { + $this->file->fwrite($data.\PHP_EOL); + ++$this->lineNumber; + + return $this->lineNumber; + } + + /** + * Rewind the underlying file to the beginning and reset the line number to 1. 
+ */ + public function rewind(): void + { + $this->file->rewind(); + $this->lineNumber = 1; + } +} diff --git a/src/Task/File/FileSplitterTask.php b/src/Task/File/FileSplitterTask.php new file mode 100644 index 00000000..e87bc2e0 --- /dev/null +++ b/src/Task/File/FileSplitterTask.php @@ -0,0 +1,110 @@ +getMergedOptions($state); + $this->splFileObjectFlags = [\SplFileObject::READ_AHEAD, \SplFileObject::SKIP_EMPTY]; + if (!$this->file instanceof SplFile) { + $this->file = new SplFile($options['file_path'], 'rb', $this->splFileObjectFlags); + $this->lineCount = $this->file->getLineCount(); + } + + // Return a temporary file containing a limited number of lines + $splittedFilename = $this->splitFile($this->file, $options['max_lines']); + $state->setOutput($splittedFilename); + } + + /** + * Reports whether the task has another chunk to produce. + * + * Returns false when no file is open, or when the source file is at end of file or its line number + * exceeds the cached line count (the file reference is then cleared); returns true otherwise. + */ + public function next(ProcessState $state): bool + { + if (!$this->file instanceof SplFile) { + return false; + } + + // Fix issue on PHP 8 with empty line at the end, even if SKIP_EMPTY is set + $endOfFile = $this->file->isEndOfFile() || $this->file->getLineNumber() > $this->lineCount; + if ($endOfFile) { + $this->file = null; + } + + return !$endOfFile; + } + + /** + * Copy lines from $file into a new file in the system temp directory and return that file's path. + * + * Copying stops when the output's line number exceeds $maxLines or $file reaches end of file; + * null or empty lines are skipped. + */ + protected function splitFile(SplFile $file, int $maxLines): string + { + $tmpFilePath = sys_get_temp_dir().\DIRECTORY_SEPARATOR.'php_'.uniqid('process', false).'.tmp'; + $splitFile = new SplFile($tmpFilePath, 'wb', $this->splFileObjectFlags); + + while ($splitFile->getLineNumber() <= $maxLines && !$file->isEndOfFile()) { + $line = $file->readLine(); + if ('' === $line || null === $line) { + continue; // This is probably an empty line, no harm to skip it + } + $splitFile->writeLine($line); + } + + return $tmpFilePath; + } + + /** + * Declare the task options: a required string `file_path` and an int `max_lines` defaulting to 1000. + */ + protected function configureOptions(OptionsResolver $resolver): void + { + $resolver->setRequired(['file_path']); + 
$resolver->setAllowedTypes('file_path', ['string']); + $resolver->setDefaults([ + 'max_lines' => 1000, + ]); + $resolver->setAllowedTypes('max_lines', ['int']); + } + + /** + * Merge the task options with the state input. + * + * When the input is a non-empty array its entries are merged over the options with array_merge(); + * any other input is replaced by an empty array, so the options are returned unchanged. + * + * @return array + */ + protected function getMergedOptions(ProcessState $state): array + { + /** @var array $options */ + $options = $this->getOptions($state); + + /** @var array|mixed $input */ + $input = $state->getInput() ?: []; + if (!\is_array($input)) { + $input = []; + } + // @var array $input + + return array_merge($options, $input); + } +}