Delimiter.php
4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
<?php
namespace PhpOffice\PhpSpreadsheet\Reader\Csv;
/**
 * Infers the field delimiter of a CSV stream.
 *
 * On construction, up to MAX_SAMPLED_LINES logical lines are read from the
 * supplied file handle and, for each candidate delimiter, the number of
 * occurrences per line (outside of enclosures) is recorded. infer() then picks
 * the candidate whose per-line count deviates least from its median.
 */
class Delimiter
{
    /**
     * Candidate delimiter characters, in order of preference (earlier entries win ties).
     */
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /**
     * Maximum number of logical lines sampled from the file.
     */
    protected const MAX_SAMPLED_LINES = 1000;

    /** @var resource */
    protected $fileHandle;

    /** @var string */
    protected $escapeCharacter;

    /** @var string */
    protected $enclosure;

    /**
     * Per-delimiter list of occurrence counts, one entry per sampled line.
     *
     * @var array
     */
    protected $counts = [];

    /** @var int */
    protected $numberLines = 0;

    /** @var ?string */
    protected $delimiter;

    /**
     * Stores the parsing settings and immediately samples the file.
     *
     * Reading starts at the handle's current position and advances it.
     *
     * @param resource $fileHandle
     */
    public function __construct($fileHandle, string $escapeCharacter, string $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Returns the first (most preferred) candidate delimiter.
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Returns the number of logical lines that were sampled.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Samples up to MAX_SAMPLED_LINES logical lines and records, per line,
     * how often each candidate delimiter occurs.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // The limit is checked BEFORE reading so that every line that bumps
        // numberLines is also counted; numberLines therefore always equals the
        // number of samples stored per delimiter.
        $this->numberLines = 0;
        while ($this->numberLines < self::MAX_SAMPLED_LINES) {
            $line = $this->getNextLine();
            if ($line === false) {
                break;
            }

            ++$this->numberLines;
            $this->countDelimiterValues($line, $delimiterKeys);
        }
    }

    /**
     * Appends the occurrence count of every candidate delimiter in $line to $this->counts.
     *
     * @param string $line          line content with enclosed sections already removed
     * @param array  $delimiterKeys candidate delimiters as array keys (array_flip of the candidates)
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        // Byte-wise histogram of the line, restricted to the candidate delimiters.
        $distribution = array_count_values(str_split($line, 1));
        $countLine = array_intersect_key($distribution, $delimiterKeys);

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $this->counts[$delimiter][] = $countLine[$delimiter] ?? 0;
        }
    }

    /**
     * Picks the candidate delimiter whose per-line occurrence count is most consistent.
     *
     * Candidates with no samples or with a median count of zero are ignored.
     * Among the rest, the one with the smallest mean square deviation from
     * its median wins; ties go to the earlier entry in POTENTIAL_DELIMETERS.
     *
     * @return ?string the inferred delimiter, or the previously stored value
     *                 (null by default) when no candidate qualifies
     */
    public function infer(): ?string
    {
        // Calculate the mean square deviations for each delimiter
        // (ignoring delimiters that haven't been found consistently)
        $meanSquareDeviations = [];

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $sampleCount = count($series);
            if ($sampleCount === 0) {
                // Nothing sampled (e.g. empty file): no median to compute.
                continue;
            }

            sort($series);

            // Median derived from the actual sample count, using an integer index.
            $middleIdx = intdiv($sampleCount - 1, 2);
            $median = ($sampleCount % 2 === 1)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            if ($median === 0 || $median === 0.0) {
                continue;
            }

            $sumOfSquares = array_reduce(
                $series,
                static function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            );

            $meanSquareDeviations[$delimiter] = $sumOfSquares / $sampleCount;
        }

        // ... and pick the delimiter with the smallest mean square deviation
        // (in case of ties, the order in POTENTIAL_DELIMETERS is respected)
        $min = INF;
        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            if (!isset($meanSquareDeviations[$delimiter])) {
                continue;
            }

            if ($meanSquareDeviations[$delimiter] < $min) {
                $min = $meanSquareDeviations[$delimiter];
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full (logical) line from the file, with enclosed sections removed.
     *
     * Physical lines are concatenated for as long as an unescaped enclosure
     * character remains open. If the end of the file is reached while an
     * enclosure is still open, the partial content is discarded and false is
     * returned. An empty enclosure string disables enclosure handling and the
     * raw physical line is returned.
     *
     * @return false|string
     */
    public function getNextLine()
    {
        // Without an enclosure character there is nothing to strip; the
        // "open enclosure" pattern would otherwise match every line and the
        // loop below would swallow the whole file.
        if ($this->enclosure === '') {
            return fgets($this->fileHandle);
        }

        // An enclosure character that is not preceded by the escape character.
        $enclosure = ($this->escapeCharacter === '' ? ''
                : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
            . preg_quote($this->enclosure, '/');
        $enclosedPattern = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
        $openEnclosurePattern = '/(' . $enclosure . ')/';

        $line = '';
        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            // Drop everything that is enclosed to avoid counting false positives in enclosures
            $stripped = preg_replace($enclosedPattern, '', $line . $newLine);
            if ($stripped === null) {
                // PCRE failure: treat as "no usable line"
                return false;
            }
            $line = $stripped;

            // If an enclosure is still open, the logical line continues on the next physical line
        } while (preg_match($openEnclosurePattern, $line) > 0);

        return $line;
    }
}