([0-9]{1,4}\. .*?)\(.*? [A-Z]{2,60}.*?)\'Empty string','string'=>$candidate); }else if(mb_strlen(str_replace(' ','',$candidate))<5){ $rejected[] = array('reason'=>'Too short','string'=>$candidate); } else if(ctype_upper(str_replace(array(' ','(',')'),'',$candidate))){ //Reject because this is all capitals $rejected[] = array('reason'=>'All capital letters','string'=>$candidate); }else if(mb_strlen($candidate)>255){ //Reject because too long $rejected[] = array('reason'=>'Too long','string'=>$candidate); }else{ $candidate = strip_tags(html_entity_decode($candidate)); $candidate = trim($candidate); if(mb_strlen($candidate)<5){ $rejected[] = array('reason'=>'Too short','string'=>$candidate); }else{ $blacklisted = false; foreach($blacklist as $phrase){ if(mb_stripos($candidate,$phrase)!==FALSE){ $blacklisted = true; break; } } if($blacklisted){ $rejected[] = array('reason'=>'Contains blacklisted string','string'=>$candidate); } else $candidates[] = $candidate; } } } } foreach($candidates as $i=>&$candidate){ //Parse the candidate text /* Formats: Petr Olegovich AVEN (born on March 16, 1955) Oleksii Ivanovych KOVALEV (born on January 19, 1989) Vladimir BESPALOV Serhii Petrovych KLIUIEV, born on August 12, 1969, businessman, brother of Andrii Petrovych KLIUIEV Mikhail Leonidovich RODIKOV (born in 1958) */ if(mb_stripos($candidate,'(born on') !== FALSE){ $name = mb_substr($candidate,0,mb_stripos($candidate,'(born on')-1); $birthday = mb_substr($candidate,mb_stripos($candidate,'(born on')+1 + strlen('born on') + 1); $birthday = mb_substr($birthday,0,mb_stripos($birthday,')')); $candidate = array('candidate'=>$candidate,'name'=>$name,'birthday'=>$birthday,'birth_year' => mb_substr($birthday,-4)); }else if(mb_stripos($candidate,'(born in') !== FALSE){ $name = mb_substr($candidate,0,mb_stripos($candidate,'(born in')-1); $birthday = mb_substr($candidate,mb_stripos($candidate,'(born in')+1 + strlen('born in') + 1); $birthday = mb_substr($birthday,0,mb_stripos($birthday,')')); 
$candidate = array('candidate'=>$candidate,'name'=>$name,'birth_year'=>$birthday); }elseif(mb_stripos($candidate,', born on') !== FALSE){ $name = mb_substr($candidate,0,mb_stripos($candidate,', born on')-1); $birthday = mb_substr($candidate,mb_stripos($candidate,', born on')+1 + strlen('born on') + 1); $first_comma_pos = mb_stripos($birthday,','); $second_comma_pos = mb_stripos($birthday,',',$first_comma_pos+1); $birthday = mb_substr($birthday,0,$second_comma_pos); $comment = mb_substr($candidate,mb_strrpos($candidate,',')+2); $comment = ucfirst($comment); $candidate = array('candidate'=>$candidate,'name'=>$name,'birthday'=>$birthday,'birth_year' => mb_substr($birthday,-4),'comment'=>$comment); }else{ $candidate = array('candidate'=>$candidate); } if(!isset($candidate['name'])) $candidate['name']=""; if(!isset($candidate['birthday'])) $candidate['birthday']=""; if(!isset($candidate['birth_year'])) $candidate['birth_year']=""; if(!isset($candidate['comment'])) $candidate['comment']=""; //Reject invalid candidates that aren't parseable as names if(empty($candidate['name'])){ if(mb_strlen($candidate['candidate'])>100){ $rejected[] = array('reason'=>'Too long','string'=>$candidate); unset($candidates[$i]); } } } } return array( 'candidates'=>$candidates, 'rejected'=>$rejected ); } if(!is_dir(CACHE_FOLDER)){ die('You need to create the cache folder before running this to avoid hammering the Gazette server: '.CACHE_FOLDER); } $urls = get_gazettes(2022); if(DEBUG_MODE) print_r($urls); $examples = array(); $regulation_urls = array(); foreach($urls as $index_url){ $html = fetch_page($index_url); $dom = new DOMDocument(); @$dom->loadHTML($html); $links = $dom->getElementsByTagName('a'); foreach($links as $link){ $href = $link->getAttribute('href'); if(mb_stripos($link->nodeValue,'special economic')!==FALSE){ if(mb_strpos($href,'sor-dors')!==FALSE){ $url = mb_substr($index_url,0,mb_strrpos($index_url,'/')+1).$href; $regulation_urls[] = $url; } } } } if(DEBUG_MODE) 
print_r($regulation_urls); // body of the `if(DEBUG_MODE)` statement that ends the preceding line

// Main driver: run identity extraction over every regulation URL collected
// so far, merge all accepted candidates into one list, and write that list
// to OUTPUT_FILE as JSON.
$examples = $regulation_urls;

/*
Fixed sample pages; uncomment to debug the parser against known inputs
instead of the URLs collected from the index pages:

$examples = array(
    'https://gazette.gc.ca/rp-pr/p2/2014/2014-03-26/html/sor-dors44-eng.html',
    'https://www.international.gc.ca/world-monde/international_relations-relations_internationales/sanctions/ukraine_regulations-reglement6.aspx?lang=eng',
    'https://gazette.gc.ca/rp-pr/p2/2022/2022-05-11/html/sor-dors84-eng.html'
);
*/

$candidates = array();

foreach ($examples as $i => $example_url) {
    // NOTE(review): fetch_regulation() is defined outside this section;
    // presumably it returns the page HTML (possibly via the cache folder).
    $html = fetch_regulation($example_url);

    // Progress line: "<url> --> <index>"
    echo $example_url.' --> ';
    echo $i;
    echo "\n";

    // extract_identities() returns array('candidates' => ..., 'rejected' => ...).
    $r = extract_identities($html);
    print_r($r);

    // Only accepted candidates go into the output; rejected strings are
    // visible in the print_r() dump above.
    if (!empty($r['candidates'])) {
        foreach ($r['candidates'] as $row) {
            $candidates[] = $row;
        }
    }
}

if (DEBUG_MODE) {
    print_r($candidates);
}

// json_encode() returns false on failure (for example malformed UTF-8 in the
// scraped text). Stop here rather than writing an empty file and reporting
// success.
$json = json_encode($candidates);
if ($json === false) {
    die('Failed to encode candidates as JSON: '.json_last_error_msg()."\n");
}

// file_put_contents() returns false when the file cannot be written.
if (file_put_contents(OUTPUT_FILE, $json) === false) {
    die('Failed to write JSON file: '.OUTPUT_FILE."\n");
}

echo "\nWrote ".count($candidates).' sanctioned entities to JSON file: '.OUTPUT_FILE."\n";
?>