1 : <?php
2 :
3 : /**
4 : * Dictionaries of Old French and Latin
5 : *
6 : * PHP 5
7 : *
8 : * @category DicFro
9 : * @package Model
10 : * @subpackage Parser
11 : * @author Michel Corne <mcorne@yahoo.com>
12 : * @copyright 2008-2010 Michel Corne
13 : * @license http://opensource.org/licenses/gpl-3.0.html GNU General Public License, version 3 (GPLv3)
14 : * @link http://www.micmap.org/dicfro
15 : */
16 :
17 : require_once 'Base/String.php';
18 :
/**
 * Dictionary parser
 *
 * Base class of the dictionary parsers: it reads a dictionary source file,
 * has each line parsed by the concrete subclass (parseLine), writes the
 * parsed lines into one or more data files, and imports them into a SQLite
 * database by piping a batch file to the sqlite3 command line tool.
 *
 * A subclass is expected to set $dictionary and $sourceFile, typically as
 * property defaults, and may preset any of the file properties to override
 * the defaults derived from the *_DEFAULT path templates.
 *
 * @category DicFro
 * @package Model
 * @subpackage Parser
 * @author Michel Corne <mcorne@yahoo.com>
 * @copyright 2008-2010 Michel Corne
 * @license http://opensource.org/licenses/gpl-3.0.html GNU General Public License, version 3 (GPLv3)
 */

abstract class Model_Parser
{
    /**
     * Template of the generic batch file; it is run through sprintf() with
     * the dictionary name as the only argument
     */
    const BATCH_FILE_TEMPLATE = '../data/word.sql';

    /**
     * Temporary batch file generated from the template
     */
    const BATCH_FILE_TEMP = '../data/temp.sql';

    /**
     * Default file paths, "%s" is replaced with the dictionary name
     */
    const BATCH_FILE_DEFAULT = '../data/%s/word.sql';
    const DATABASE_DEFAULT = '../data/%s/dictionary.sqlite';
    const DATA_FILE_DEFAULT = '../data/%s/word.txt';
    const ERROR_FILE_DEFAULT = '../data/%s/error.txt';

    /**
     * Name of the dictionary, used to build the default file paths
     */
    public $dictionary;

    /**
     * Batch file(s) passed to sqlite3, converted to an array by the constructor
     */
    public $batchFile;

    /**
     * Path of the SQLite database
     */
    public $dataBase;

    /**
     * Data file(s) indexed by data name, converted to an array by the constructor
     */
    public $dataFile;

    /**
     * Path of the file the collected messages are written to on destruction
     */
    public $errorFile;

    /**
     * Path of the dictionary source file
     */
    public $sourceFile;

    /**
     * Whether warnings are printed as well as errors
     */
    public $verbose;

    /**
     * Messages collected by error(), written to the error file on destruction
     */
    public $error;

    /**
     * Base_String helper instance created by the constructor
     */
    public $string;

    /**
     * Sets the verbosity and fills in the file paths that are not preset
     *
     * @param bool $verbose true to print the warnings, false to print the errors only
     */
    public function __construct($verbose = false)
    {
        $this->verbose = (bool) $verbose;

        // a path preset by the subclass takes precedence over the default
        if (!$this->batchFile) {
            $this->batchFile = sprintf(self::BATCH_FILE_DEFAULT, $this->dictionary);
        }

        if (!$this->dataBase) {
            $this->dataBase = sprintf(self::DATABASE_DEFAULT, $this->dictionary);
        }

        if (!$this->dataFile) {
            $this->dataFile = sprintf(self::DATA_FILE_DEFAULT, $this->dictionary);
        }

        if (!$this->errorFile) {
            $this->errorFile = sprintf(self::ERROR_FILE_DEFAULT, $this->dictionary);
        }

        // a single file is handled as a list of one file
        settype($this->batchFile, 'array');
        settype($this->dataFile, 'array');

        $this->string = new Base_String;
    }

    /**
     * Writes the collected messages to the error file, if there are any
     */
    public function __destruct()
    {
        if ($this->errorFile && $this->error) {
            print "writing {$this->errorFile} ... ";

            // the warning is suppressed because the failure is reported right below,
            // error() cannot be used here as it would exit from within the destructor
            $written = @file_put_contents($this->errorFile, implode('', (array) $this->error));
            $status = ($written === false) ? "failed" : "done";

            print $status;
        }
    }

    /**
     * Runs the whole process: reads, parses, writes and imports the dictionary
     *
     * @param int $lineStart number of the first line to process, 1 by default
     * @param int $lineCount number of lines to process, all by default
     */
    public function create($lineStart = null, $lineCount = null)
    {
        $this->preProcessing();

        // reads and parses the dictionary
        list($lines, $lineStart) = $this->read($lineStart, $lineCount);
        $data = $this->parse($lines, $lineStart);
        $data = $this->postProcessing($data);

        // writes and imports the dictionary data files into the database
        $this->write($data);
        $this->import();
    }

    /**
     * Creates the temporary batch file from the batch file template
     *
     * Exits through error() if the template cannot be read or the
     * temporary file cannot be written.
     */
    public function createBatchFile()
    {
        $template = file_get_contents(self::BATCH_FILE_TEMPLATE);

        // an empty template is as unusable as an unreadable one
        if (!$template) {
            $this->error("cannot read " . self::BATCH_FILE_TEMPLATE, true);
        }

        $content = sprintf($template, $this->dictionary);

        // likewise, a write of zero bytes is treated as a failure
        if (!file_put_contents(self::BATCH_FILE_TEMP, $content)) {
            $this->error("cannot write " . self::BATCH_FILE_TEMP, true);
        }
    }

    /**
     * Reports an error or a warning
     *
     * The message is printed if it is an error or if verbosity is requested
     * by the caller or the instance. It is collected to be written to the
     * error file if an error file is set. An error terminates the script
     * with exit status 1.
     *
     * @param string $message    the message
     * @param bool   $isError    true for an error (fatal), false for a warning
     * @param int    $lineNumber number of the source file line, null if not relevant
     * @param bool   $verbose    true to print a warning regardless of the instance verbosity
     */
    public function error($message, $isError, $lineNumber = null, $verbose = false)
    {
        $errorType = $isError ? 'Error' : 'Warning';

        $string = "\n$errorType! ";

        if (!is_null($lineNumber)) {
            $string .= "({$this->sourceFile} #$lineNumber) ";
        }

        $string .= "$message\n";

        if ($isError || $verbose || $this->verbose) {
            print $string;
        }

        if (!empty($this->errorFile)) {
            $this->error[] = $string;
        }

        if ($isError) {
            exit(1);
        }
    }

    /**
     * Imports the data files into the database
     *
     * Each batch file is piped to sqlite3 as a ".read" command. The temporary
     * batch file is generated from the template and used instead if a batch
     * file does not exist. The last line of the sqlite3 output is expected to
     * be a number, which is reported as the count of imported lines.
     *
     * NOTE(review): the paths are inserted unquoted in the shell command, so
     * paths containing spaces or shell metacharacters are not supported.
     */
    public function import()
    {
        // creates the data base
        print "creating database {$this->dataBase} ... \n";

        // "win" also matches "Darwin" (macOS), which must not get the Windows path fix
        $isWindows = stripos(PHP_OS, 'win') !== false && stripos(PHP_OS, 'darwin') === false;

        foreach ($this->batchFile as $name) {
            print "reading $name ... ";

            $isBatchFile = file_exists($name);

            if (!$isBatchFile) {
                // falls back on the batch file generated from the template
                $this->createBatchFile();
                $name = self::BATCH_FILE_TEMP;
            }

            $command = "echo .read $name | sqlite3 {$this->dataBase}";

            if ($isWindows) {
                // removes the drive letters and uses forward slashes
                $command = preg_replace('~\w:~', '', $command);
                $command = str_replace('\\', '/', $command);
            }

            // exec() appends to an existing array, so the output is reset for each batch file
            $output = array();
            $lineCount = exec($command, $output, $returnVar);

            if ($returnVar) {
                $this->error("cannot execute $name (error: $returnVar)", true);
            }

            if (!is_numeric($lineCount)) {
                $this->error("cannot import via $name (error: $lineCount)", true);
            }

            print "$lineCount lines imported\n";

            if (!$isBatchFile) {
                unlink(self::BATCH_FILE_TEMP);
            }
        }
    }

    /**
     * Hook telling if a line marks the end of the data, never by default
     *
     * Not called within this class, presumably meant to be used and
     * overridden by the subclasses.
     *
     * @param  string $line the source file line
     * @return bool   always false
     */
    public function isEndOfData($line)
    {
        return false;
    }

    /**
     * Hook telling if a line is to be ignored, never by default
     *
     * Not called within this class, presumably meant to be used and
     * overridden by the subclasses.
     *
     * @param  string $line the source file line
     * @return bool   always false
     */
    public function isLineIgnored($line)
    {
        return false;
    }

    /**
     * Parses the dictionary lines
     *
     * The strings returned by parseLine() are appended, one per line, to the
     * data having the same name. A dot is printed when the line number
     * reaches a multiple of 1000, to show the progress.
     *
     * @param  array $lines      the source file lines
     * @param  int   $lineNumber number of the first line
     * @return array the parsed data indexed by data name, same keys as the data files
     */
    public function parse($lines, $lineNumber)
    {
        // parses the dictionary
        print "parsing {$this->sourceFile} ";

        $data = array_fill_keys(array_keys($this->dataFile), '');

        foreach ($lines as $line) {
            // parses the line, parseLine() must return an array indexed by data name
            $parsed = $this->parseLine($line, $lineNumber);

            foreach ($parsed as $name => $string) {
                // NOTE(review): empty() also discards the string "0", confirm no entry can be "0"
                if (!empty($string)) {
                    $data[$name] .= $string . "\n";
                }
            }

            $lineNumber++;

            if ($lineNumber % 1000 == 0) {
                print '.';
            }
        }

        print ' ' . count($lines) . " lines parsed\n";

        return $data;
    }

    /**
     * Parses a line of the dictionary
     *
     * @param  string $line       the source file line
     * @param  int    $lineNumber number of the line
     * @return array  the parsed strings indexed by data name, see parse()
     */
    abstract public function parseLine($line, $lineNumber);

    /**
     * Hook called before the dictionary is read, does nothing by default
     */
    public function preProcessing()
    {
    }

    /**
     * Hook called after the dictionary is parsed, returns the data unchanged by default
     *
     * @param  array $data the parsed data indexed by data name
     * @return array the data to write
     */
    public function postProcessing($data)
    {
        return $data;
    }

    /**
     * Reads the dictionary source file, and optionally slices it
     *
     * Exits through error() if the file cannot be read or is empty.
     *
     * @param  int   $lineStart number of the first line to keep, 1 by default
     * @param  int   $lineCount number of lines to keep, all the lines up to
     *                          the end of the file by default
     * @return array the lines and the number of the first line
     */
    public function read($lineStart = null, $lineCount = null)
    {
        // reads the dictionary
        print "reading {$this->sourceFile} ... ";

        // the warning is suppressed because the failure is reported right below
        $lines = @file($this->sourceFile);

        if (empty($lines)) {
            $this->error("cannot read or empty file {$this->sourceFile}", true);
        }

        print count($lines) . " lines read\n";

        // numeric strings, ex. command line arguments, are accepted
        $lineStart = empty($lineStart) ? 1 : (int) $lineStart;
        $lineCount = empty($lineCount) ? null : (int) $lineCount;

        if ($lineStart !== 1 || !is_null($lineCount)) {
            // slices the dictionary (used only for debugging purposes)
            print "slicing {$this->sourceFile} ... ";

            if (is_null($lineCount)) {
                // no count means up to the end of the file, whatever its size
                $lines = array_slice($lines, $lineStart - 1);
            } else {
                $lines = array_slice($lines, $lineStart - 1, $lineCount);
            }

            print count($lines) . " lines sliced\n";
        }

        return array($lines, $lineStart);
    }

    /**
     * Validates that a word is not lower than the previously validated word
     *
     * Validating the word order helps spot invalid entries where entries are
     * expected to be sorted in the source file, ex. gdf like Txt files.
     * It should not be used for dictionaries that are not sorted, ex. ghostwords.
     * Exits through error() if the order is broken.
     *
     * NOTE(review): the previous word is kept in a static variable, so it is
     * not reset between parser instances created in the same script. Confirm
     * this is intended before parsing several dictionaries in one run.
     *
     * @param string $word       the word to validate
     * @param int    $lineNumber number of the source file line
     */
    public function validateWordOrder($word, $lineNumber)
    {
        static $prevWord = null;

        if (!is_null($prevWord) && $prevWord > $word) {
            $this->error("bad word order: $prevWord > $word", true, $lineNumber);
        }

        $prevWord = $word;
    }

    /**
     * Writes the dictionary data files
     *
     * Exits through error() if a file cannot be written.
     *
     * @param array $data the data to write indexed by data name, each name
     *                    must be a key of the data files
     */
    public function write($data)
    {
        // writes the dictionary data file
        foreach ($data as $name => $string) {
            $file = $this->dataFile[$name];

            print "writing data file $file ... ";

            // the warning is suppressed because the failure is reported right below
            $bytesCount = @file_put_contents($file, $string);

            // only false is a failure, 0 is a successful write of empty data
            if ($bytesCount === false) {
                $this->error("cannot write file $file", true);
            }

            print "done\n";
        }
    }
|