Paperyard
ppyrd.namer.php
Go to the documentation of this file.
1 <?php
2  require_once('dbHandler.php');
3  require_once('ppyrd.base.php');
4 
16  class pdfNamer extends ppyrd {
17 
/**
 * Prepares the renaming of one PDF: resets the per-document state, loads the
 * naming configuration from the database and reads the text of the PDF.
 *
 * @param string $pdf file name of the PDF, relative to the current working directory
 * @param object $db  database handler; getConfigValue() is called on it here
 */
public function __construct($pdf, $db)
{
    // cleaning the log
    $this->log = "";

    // don't output debug information
    $this->debug = false;

    // database handler used for all configuration and rule lookups
    $this->db = $db;

    // the name the file currently has
    $this->oldName = $pdf;

    // A name that still contains placeholders, or that already starts with
    // "<8 digits> -", has been through the namer before (e.g. the document was
    // not fully recognized and is rematched with updated DB entries). Such a
    // file keeps its name so that only the open placeholders are retried.
    $alreadyNamed = preg_match('(ddatum|ffirma|bbetreff|wwer|bbetrag)', $pdf)
        || preg_match('(^\d{8}\s\-)', $pdf);

    if ($alreadyNamed) {
        $this->newName = $pdf;
    } else {
        // getting new file name structure from database
        $this->newName = $this->db->getConfigValue('newFilenameStructure');

        // appending old file name - otherwise just add .pdf to the end
        if ($this->db->getConfigValue('appendOldFilename') == 1) {
            $this->newName .= $pdf;
        } else {
            $this->newName .= ".pdf";
        }
    }

    $this->companyName = "";
    $this->subjectName = "";

    // standard tag if no tags are found
    $this->tags = "[nt]";

    // tag lists filled by matchSenders()/matchSubjects(); starting with empty
    // arrays keeps addTags() from merging undefined values when nothing matched
    $this->matchedCompanyTags = array();
    $this->matchedSubjectTags = array();

    // minimum score required until we accept the company as correct
    $this->companyMatchRating = $this->db->getConfigValue("companyMatchRating");

    // minimum score required until we accept the subject as correct
    $this->subjectMatchRating = $this->db->getConfigValue("subjectMatchRating");

    // regular expression used by matchDates() to find dates in the content
    $this->dateRegEx = $this->db->getConfigValue("dateRegEx");

    // reads the pdf
    $this->getTextFromPdf($pdf);
}
75 
76 
/**
 * Extracts the text of a PDF by running the external "pdftotext" tool.
 *
 * The output is stored in $this->content, one array element per output line.
 *
 * @param string $pdf path of the PDF file to read
 */
function getTextFromPdf($pdf) {
    // exec() appends to an already existing array, so start from a clean one
    $this->content = array();

    // a name starting with "-" would be read as an option by pdftotext
    $path = (strpos($pdf, '-') === 0) ? './' . $pdf : $pdf;

    // Single-quote the path for the shell; an embedded single quote is written
    // as '\''. Done by hand because escapeshellarg() can drop non-ASCII
    // characters (e.g. umlauts) when the process runs with a C/POSIX locale.
    $quotedPath = "'" . str_replace("'", "'\\''", $path) . "'";

    // "-" as output file makes pdftotext write the text to stdout
    exec('pdftotext -layout ' . $quotedPath . ' -', $this->content);
}
86 
/**
 * Appends one entry to the processing log of the current document.
 *
 * @param string $str text to add; a line break is appended automatically
 */
function addToLog($str) {
    $entry = $str . "\n";
    $this->log = $this->log . $entry;
}
96 
/**
 * Converts a date expression found in the text into "YYYYmmdd" form.
 * Used as array_walk() callback in matchDates(), hence the by-reference item.
 *
 * Expressions containing "im" are resolved to the last day of the month
 * (format "Ymt"); presumably these are month-only dates such as
 * "im <month> <year>" - TODO confirm against the configured dateRegEx.
 *
 * An expression strtotime() cannot parse is replaced by the placeholder
 * "ddatum" instead of silently becoming the epoch date, so it can never be
 * picked as the document date.
 *
 * @param string $item date expression, replaced in place by the converted value
 * @param mixed  $key  array key supplied by array_walk(); not used
 */
function toDate(&$item, $key) {
    $this->addToLog("Date found $item");

    $monthOnly = (strpos($item, "im") !== false);
    $expression = $monthOnly ? str_replace("im ", "", $item) : $item;

    $timestamp = strtotime($expression);
    if ($timestamp === false) {
        $this->addToLog("Date $expression could not be parsed - ignored");
        // sorts after every digit string and is never <= today's date
        $item = "ddatum";
        return;
    }

    // "t" yields the number of days in the month, i.e. its last day
    $item = date($monthOnly ? "Ymt" : "Ymd", $timestamp);
}
113 
/**
 * Picks the most recent date that is not in the future.
 *
 * @param array $dates dates in "YYYYmmdd" form
 * @return string the latest date that is today or earlier, or the placeholder
 *                "ddatum" when no such date exists
 */
function closestDateToToday ($dates)
{
    $today = date('Ymd');

    // newest first, so the first candidate that is not in the future wins
    rsort($dates);

    foreach ($dates as $candidate) {
        if ($candidate > $today) {
            continue;
        }
        return $candidate;
    }

    return "ddatum";
}
130 
131 
/**
 * Normalizes the extracted PDF text for matching.
 *
 * Turns the array of lines in $this->content into one lowercase string,
 * replaces everything matched by the configured 'stripCharactersFromContent'
 * pattern with a space and collapses runs of whitespace into a single space.
 */
function cleanContent()
{
    // one long lowercase string avoids case sensitive mismatches
    $text = strtolower(implode(" ", $this->content));

    // strip the characters the configuration asks us to ignore
    $stripPattern = $this->db->getConfigValue('stripCharactersFromContent');
    $text = preg_replace($stripPattern, " ", $text);

    // double, triple, ... whitespace becomes a single space
    $this->content = preg_replace("/\s\s+/", " ", $text);
}
153 
154  //
/**
 * Looks for dates in the content using the configured regular expression and
 * writes the most likely one into the new file name.
 *
 * The chosen date is stored in $this->newDate and replaces the placeholder
 * "ddatum" in $this->newName. When no usable date is found the placeholder
 * stays in place (closestDateToToday() returns "ddatum" in that case).
 */
function matchDates()
{
    $this->addToLog('');
    $this->addToLog('===');
    $this->addToLog('LOOKING FOR DATES');

    // date formats come from the configuration
    $matches = array();
    $found = preg_match_all($this->dateRegEx, $this->content, $matches);

    // only consider full matches and remove duplicates; a failing or
    // non-matching pattern simply yields no dates
    $dates = ($found && isset($matches[0])) ? array_unique($matches[0]) : array();

    // getting into YYYYmmdd format; the array callable replaces the string
    // form "self::toDate", which is deprecated as of PHP 8.2
    array_walk($dates, array($this, 'toDate'));
    $dates = array_unique($dates);

    // most likely date found
    $this->newDate = $this->closestDateToToday($dates);

    // changing date in fileName
    $this->newName = str_replace("ddatum", $this->newDate, $this->newName);
}
183 
184 
/**
 * Replaces the personal variables within a string.
 *
 * Every row returned by getPersonalVariables() maps 'variableName' to
 * 'replaceWith'; the pairs are applied one after another in row order.
 *
 * @param string $searchTerm text that may contain personal variables
 * @return string the text with all known variables replaced
 */
function replacePersonalVariables($searchTerm)
{
    $variableNames = array();
    $substitutes = array();

    // collecting all replacement pairs
    $rows = $this->db->getPersonalVariables();
    while ($row = $rows->fetchArray()) {
        $variableNames[] = $row['variableName'];
        $substitutes[] = $row['replaceWith'];
    }

    // nothing to replace - hand the term back untouched
    if (count($variableNames) === 0) {
        return $searchTerm;
    }

    // the array form processes the pairs in order, like a loop would
    return str_replace($variableNames, $substitutes, $searchTerm);
}
203 
/**
 * Scores the active sender rules against the content and picks the company.
 *
 * A rule matches when every search term in 'foundWords' occurs in the content;
 * several terms are separated by commas. Each matching rule adds its
 * 'companyScore' to its 'fileCompany'. The company with the highest total is
 * stored in $this->companyName, its tags in $this->matchedCompanyTags, and it
 * replaces the placeholder "ffirma" in the new file name when the total
 * reaches $this->companyMatchRating.
 */
function matchSenders ()
{
    // looking for active rules from database to check document against
    $results = $this->db->getActiveSenders();
    $company = array();
    $tmpMatchedCompanyTags = array();

    // stays an empty list when no rule matches
    $this->matchedCompanyTags = array();

    // start matching search terms vs content
    while ($row = $results->fetchArray()) {

        // a rule without a comma is a single term that is used as it is;
        // comma separated terms are trimmed individually
        $isList = (strpos($row['foundWords'], ",") !== false);
        $rawTerms = $isList ? explode(',', $row['foundWords']) : array($row['foundWords']);

        $foundAll = true;
        foreach ($rawTerms as $rawTerm) {
            // Personal variables are substituted first and the result is
            // lowercased afterwards, so substituted values are compared in
            // lowercase like the content - identical for both rule kinds.
            $term = strtolower($this->replacePersonalVariables($rawTerm));
            if ($isList) {
                $term = trim($term);
            }

            // an empty term can never be found; stop at the first miss
            if ($term === '' || strpos($this->content, $term) === false) {
                $foundAll = false;
                break;
            }
        }

        if (!$foundAll) {
            continue;
        }

        $name = $row['fileCompany'];
        if (!isset($company[$name])) {
            $company[$name] = 0;
            $tmpMatchedCompanyTags[$name] = array();
        }
        $company[$name] += $row['companyScore'];

        // keeping a list of match hits for later tagging
        $tmpMatchedCompanyTags[$name][] = $row['tags'];

        // writing log
        $this->addToLog('"' . $row['foundWords'] . '" ' . " found - " . $row['companyScore'] . " points for company " . $row['fileCompany']);
    } // end - matching search terms vs content

    // sorting so highest match is on top
    arsort($company);

    if (count($company) > 0) {
        reset($company);
        $this->companyName = key($company);
        $companyMatchRating = current($company);
        $this->matchedCompanyTags = $tmpMatchedCompanyTags[$this->companyName];

        // checking match ranking
        $this->output("company: " . $this->companyName . " scored " . $companyMatchRating);

        if ($companyMatchRating >= $this->companyMatchRating) {
            $this->newName = str_replace("ffirma", $this->companyName, $this->newName);
        }
    }
}
299 
/**
 * Looks for amounts in the content and writes the highest one into the name.
 *
 * Candidates are found with the configured 'matchPriceRegex'. In each
 * candidate "." is removed and "," is read as the decimal separator. The
 * highest amount (0 when nothing is found) is stored in $this->price,
 * formatted with "," as decimal and "." as thousands separator, and replaces
 * the placeholder "bbetrag" as "EUR<amount>".
 */
function matchPrice()
{
    // matching all potential price mentions
    $pattern = $this->db->getConfigValue('matchPriceRegex');
    preg_match_all($pattern, $this->content, $results);

    // full matches only, reduced to digits, comma and period
    $candidates = preg_replace("/[^0-9,.]/", "", array_values($results[0]));

    $highest = 0;
    foreach ($candidates as $candidate) {
        // drop the periods first, then turn the comma into the decimal point
        $amount = floatval(str_replace(array('.', ','), array('', '.'), $candidate));
        $highest = max($highest, $amount);
    }

    // setting max price
    $this->price = number_format($highest, 2, ",", ".");

    $this->newName = str_replace("bbetrag", "EUR" . $this->price, $this->newName);

    $this->output("amount: EUR" . $this->price);
}
327 
328 
/**
 * Scores the active subject rules against the content and picks the subject.
 *
 * Only rules whose 'foundCompany' is empty or equals the matched company are
 * considered. A rule matches when every search term in 'foundWords' occurs in
 * the content; several terms are separated by commas. Each matching rule adds
 * its 'subjectScore' to its 'fileSubject'. The subject with the highest total
 * is stored in $this->subjectName, its tags in $this->matchedSubjectTags, and
 * it replaces the placeholder "bbetreff" in the new file name when the total
 * reaches $this->subjectMatchRating.
 */
function matchSubjects() {
    // looking for active rules from database to check document against
    $results = $this->db->getActiveSubjects();
    $subject = array();
    $tmpMatchedSubjectTags = array();

    // start matching search terms vs content
    while ($row = $results->fetchArray()) {

        // the rule may be restricted to one company; an empty restriction
        // applies to every document
        $ruleCompany = isset($row['foundCompany']) ? trim($row['foundCompany']) : '';
        if (!($ruleCompany == $this->companyName || empty($ruleCompany))) {
            continue;
        }

        // a rule without a comma is a single term that is used as it is;
        // comma separated terms are trimmed individually
        $isList = (strpos($row['foundWords'], ",") !== false);
        $rawTerms = $isList ? explode(',', $row['foundWords']) : array($row['foundWords']);

        $foundAll = true;
        foreach ($rawTerms as $rawTerm) {
            // Personal variables are substituted first and the result is
            // lowercased afterwards, so substituted values are compared in
            // lowercase like the content - identical for both rule kinds.
            $term = strtolower($this->replacePersonalVariables($rawTerm));
            if ($isList) {
                $term = trim($term);
            }

            // an empty term can never be found; stop at the first miss
            if ($term === '' || strpos($this->content, $term) === false) {
                $foundAll = false;
                break;
            }
        }

        if (!$foundAll) {
            continue;
        }

        $name = $row['fileSubject'];
        if (!isset($subject[$name])) {
            $subject[$name] = 0;
            $tmpMatchedSubjectTags[$name] = array();
        }
        $subject[$name] += $row['subjectScore'];

        // keeping a list of match hits for later tagging
        $tmpMatchedSubjectTags[$name][] = $row['tags'];

        // writing log
        $this->addToLog('"' . $row['foundWords'] . '" ' . " found - " . $row['subjectScore'] . " points for subject " . $row['fileSubject']);
    } // end - matching search terms vs content

    // sorting so highest match is on top
    arsort($subject);

    // defaults for "nothing matched"
    $subjectMatchRating = null;
    $this->subjectName = "";
    $this->matchedSubjectTags = array();

    if (count($subject) > 0) {
        reset($subject);
        $this->subjectName = key($subject);
        $subjectMatchRating = current($subject);
        $this->matchedSubjectTags = $tmpMatchedSubjectTags[$this->subjectName];
    }

    // checking match ranking
    $this->output("subject: " . $this->subjectName . " scored " . $subjectMatchRating);

    // without a match the placeholder must stay, whatever the threshold is
    if ($subjectMatchRating !== null && $subjectMatchRating >= $this->subjectMatchRating) {
        $this->newName = str_replace("bbetreff", $this->subjectName, $this->newName);
    }
}
424 
425 
/**
 * Reads the recipient list from the database and tries to match it in the text.
 *
 * Every active recipient whose 'recipientName' occurs in the content is added
 * by its 'shortNameForFile'. The short names are sorted alphabetically, joined
 * with commas and replace the placeholder "wwer" in the new file name. The
 * placeholder stays when nobody was found.
 */
function matchRecipients() {
    // looking for active rules from database to check document against
    $results = $this->db->getActiveRecipients();

    // short name => number of hits, matched recipients only
    $hits = array();

    // for each rule check if the name occurs in the text
    while ($row = $results->fetchArray()) {
        $needle = strtolower($row['recipientName']);

        // an empty name cannot be searched for
        $cfound = ($needle === '') ? 0 : substr_count($this->content, $needle);
        $this->output("look for " . $row['recipientName'] . " found $cfound", 1);

        if ($cfound > 0) {
            $shortName = $row['shortNameForFile'];
            $hits[$shortName] = isset($hits[$shortName]) ? $hits[$shortName] + $cfound : $cfound;
        }
    }

    // sort the results alphabetically by short name
    ksort($hits, SORT_STRING);

    // The names are the keys. Flipping the array instead would drop every
    // recipient that shares its hit count with another one.
    $recipients = implode(',', array_keys($hits));

    // write the new name
    if ($recipients !== '') {
        $this->newName = str_replace("wwer", $recipients, $this->newName);
    }
}
460 
/**
 * Builds the tag string from the tags of the matched company and subject.
 *
 * The tag entries collected by matchSenders() and matchSubjects() may each hold
 * several comma separated tags. They are split, trimmed, wrapped in square
 * brackets, de-duplicated, sorted and concatenated into $this->tags, which then
 * replaces the default tag "[nt]" in the new file name. Without any tag
 * $this->tags is left as it is.
 */
function addTags() {
    // tossing all tag entries into one list; a missing list counts as empty
    $entries = array();
    if (isset($this->matchedCompanyTags) && is_array($this->matchedCompanyTags)) {
        $entries = array_merge($entries, $this->matchedCompanyTags);
    }
    if (isset($this->matchedSubjectTags) && is_array($this->matchedSubjectTags)) {
        $entries = array_merge($entries, $this->matchedSubjectTags);
    }

    // splitting up comma separated values and cleaning the tags
    $cleantags = array();
    foreach ($entries as $entry) {
        foreach (explode(',', (string) $entry) as $tag) {
            $tag = trim($tag);
            if ($tag !== '') {
                $cleantags[] = "[$tag]";
            }
        }
    }

    if (count($cleantags) > 0) {
        // removing duplicates
        $cleantags = array_unique($cleantags);

        // sorting tags
        sort($cleantags);

        // joining them into one string
        $this->tags = implode('', $cleantags);

        $this->output("tags: " . $this->tags);
    }
    else {
        $this->output("tags: no tags to assign");
    }

    // changing the tag placeholder in fileName only if tags are to assign
    if (!empty($this->tags)) {
        $this->newName = str_replace("[nt]", $this->tags, $this->newName);
    }
}
500 
501 
/**
 * Main function calling the process steps to identify the document.
 *
 * Runs all matchers, then renames the file: a name without remaining
 * placeholders is moved to ../outbox/, a partially matched one is renamed in
 * place. The result is written to the database log in either case.
 */
function run() {
    // cleaning content of the PDF document
    $this->cleanContent();

    // looking for dates in content
    $this->matchDates();

    // match senders (companies) from database
    $this->matchSenders();

    // match subjects from database
    $this->matchSubjects();

    // matching tags once company and subject are matched
    $this->addTags();

    // match recipients from database
    $this->matchRecipients();

    // looking for the amount
    $this->matchPrice();

    // Single-quote an argument for the shell; an embedded single quote is
    // written as '\''. Done by hand because escapeshellarg() can drop non-ASCII
    // characters (e.g. umlauts) when the process runs with a C/POSIX locale.
    $quote = function ($argument) {
        return "'" . str_replace("'", "'\\''", $argument) . "'";
    };

    $target = null;
    if (!preg_match('(ddatum|ffirma|bbetreff|wwer|bbetrag)', $this->newName)) {
        // everything matched - the file goes to the outbox
        $target = '../outbox/' . $this->newName;
    }
    elseif ($this->oldName != $this->newName) {
        // something is still unmatched - rename but don't move
        $target = $this->newName;
    }

    if ($target !== null) {
        // "--" ends the options so that a name starting with "-" is safe
        exec('mv --backup=numbered -- ' . $quote($this->oldName) . ' ' . $quote($target));
    }

    $this->output("new name: " . $this->newName);

    // logging everything to database
    $this->db->writeLog($this->oldName, $this->newName, $this->content, $this->log);
}
551 
552  }
553 
554 
555 // main program
556 
// $db is not created in this file; presumably one of the required files
// provides it - TODO confirm
$ppyrd = new ppyrd($db);

$ppyrd->output("starting paperyard");

$ppyrd->checkCliVsWebserver();

// switching to working directory; without it the relative paths used while
// renaming would point to the wrong place, so stop instead of touching files
$inbox = "/data/inbox";
if (!chdir($inbox)) {
    $ppyrd->output("cannot switch to " . $inbox . " - aborting");
    $db->close();
    exit(1);
}

// loop all pdfs and call the pdf parser; glob() returns false on error
$pdfs = glob("*.pdf");
if ($pdfs === false) {
    $pdfs = array();
}

foreach ($pdfs as $fileName) {
    $namer = new pdfNamer($fileName, $db);
    $namer->run();
}

$db->close();
586 
587 ?>
getTextFromPdf($pdf)
function executes pdftotext to extract text from file
Definition: ppyrd.namer.php:81
addTags()
function adds tags once company and subject are correctly matched
matchSenders()
reads rulesets from database and executes accordingly
handling database connection and queries
Definition: dbHandler.php:15
run()
main function calling relevant process steps to identify document
$pdfs
toDate(&$item, $key)
converts a text string to a date. used for array walk in matchDates
__construct($pdf, $db)
constructor for the class
Definition: ppyrd.namer.php:24
matchSubjects()
matching subject
matchRecipients()
reads recipient list from database and tries to match in text
cleanContent()
takes the PDF content and cleans it up
matchDates()
looks for dates in the content of the file using a regular expression
output($string, $debug=0)
outputs string
Definition: ppyrd.base.php:78
replacePersonalVariables($searchTerm)
replaces the personal variables within a string
closestDateToToday($dates)
takes an array of dates and returns the closest one before today (since Paper documents have dates in...
$ppyrd
takes care of the correct naming of files
Definition: ppyrd.namer.php:16
addToLog($str)
Definition: ppyrd.namer.php:93
matchPrice()
checks if there is a price in the text