Sostegno/2022/Microgrant WMI/Relazione conclusiva/script-import-export.php
Versione del 6 giu 2022 alle 12:32 di System.bot (discussione | contributi) (export from /home/www-data/landscapefor/atlas/beta/cli/wikidata-import-export.php)
This is the AtlasFor's Wikidata import-export command line interface script.
It's designed to:
- list all AtlasFor points of interest, ordered by last import date (never-imported and least recently imported first)
- chunk them 50 at a time, so that each chunk needs a single HTTP call to the Wikidata wbgetentities API
  - this is done using the boz-mw framework, via Wikidata::instance()->fetch()
- update our local Wikidata identifiers with any upstream Wikidata redirect (fixing merges)
- export our label in Wikidata, only if one is missing
- export our short description in Wikidata, only if one is missing
- export our geolocation in Wikidata, only if one is missing and if it's not an area
- if the preferred geolocation is NOT the one from AtlasFor, investigate why
- allow to push a better geolocation to Wikidata, or pull one from it
- import Wikipedia editions (in some languages)
- import Wikimedia Commons link
- import official website (keeping language), and other identifiers
- export our street address, only if one is missing and if it's not an area
- import the street address from Wikidata, if we do not have one
- export our local identifier (https://www.wikidata.org/wiki/Property:P7004) if we saved at least 1 thing, or if our POI is important
/home/www-data/landscapefor/atlas/beta/cli/wikidata-import-export.php
#!/usr/bin/php
<?php
# Landscapefor-map - The "Landscapefor" map management system
# Copyright (C) 2018, 2019, 2020, 2021, 2022 Valerio Bozzolan
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

/*
 * AtlasFor <-> Wikidata import/export command line script.
 *
 * For every POI with a Wikidata ID (oldest "last import" first) it fetches
 * the Wikidata entity in chunks of 50 and then:
 *  - follows upstream redirects (merged items)
 *  - prepares missing labels, coordinates, address and the AtlasFor ID for export
 *  - imports external identifiers and the address into the local database
 *  - asks for confirmation before saving anything on Wikidata
 *
 * Options:
 *  --poiid=ID              process a single POI (also enables --upgrade-coordinates)
 *  --no-export             never save on Wikidata
 *  --no-import-messages    do not print the list of imported information
 *  --no-just-identifiers   do not export the AtlasFor ID when it's the only change
 *  --upgrade-coordinates   allow replacing coordinates previously exported by us
 */

// allowed only from command line interface
if( !isset( $argv[ 0 ] ) ) {
	exit( 1 );
}

// autoload suckless-php
require __DIR__ . '/../load.php';

// autoload boz-php
require BOZ_MW;

// declare usage of useful classes
use \wm\Wikidata;
use \wb\StatementExternalID;
use \wb\StatementGlobeCoordinate;
use \wb\StatementMonolingualText;
use \wb\Label;
use \wb\Reference;
use \wb\SnakURL;
use \wb\SnakTime;
use \wb\SnakItem;

// command line arguments
$opts = getopt( '', [
	'poiid:',
	'no-export',
	'no-import-messages',
	'no-just-identifiers',
	'upgrade-coordinates',
] );

$LANGUAGE_PREFERENCES = [
	'it',
	'en',
];

// export option
$POIID      = $opts['poiid'] ?? null;
$EXPORT     = !isset( $opts['no-export'] );
$NO_IMPORT  =  isset( $opts['no-import-messages'] );
$NO_JUST_ID =  isset( $opts['no-just-identifiers'] );
$UP_COORDS  =  isset( $opts['upgrade-coordinates'] );

// when a single POI is selected, try to upgrade the coordinates
if( $POIID ) {
	$UP_COORDS = true;
}

echo "--poiid specified: " . ( $POIID ? $POIID : 'no' ) . "\n";
echo "--no-export specified: " . ( $EXPORT ? 'no' : 'yes' ) . "\n";
echo "--upgrade-coordinates specified: " . ( $UP_COORDS ? 'yes' : 'no' ) . "\n";
echo "--no-import-messages specified: " . ( $NO_IMPORT ? 'yes' : 'no' ) . "\n";
echo "--no-just-identifiers specified: " . ( $NO_JUST_ID ? 'yes' : 'no' ) . "\n";

// avoid exporting too generic addresses (like just 'Torino')
$MIN_LENGTH_ADDRESS = 14;

// Wikidata language items mapped to the language codes we store
$WIKIDATA_LANGUAGES = [
	'Q1860' => 'en',
	'Q652'  => 'it',
	'Q150'  => 'fr',
	'Q7850' => 'zh',
	'Q8798' => 'uk',
];

// properties to be queried from Wikidata APIs
$ENTITY_PROPS = [
	'labels',
	'descriptions',
	'claims',
];

// get the externalsites that can be extracted from a Wikidata property
$importable_identifiers = ( new ExternalsiteApi() )
	->joinExternalsitecategory()
	->whereWikidataPropertyIsNotNULL()
	->queryResults();

$POI_FIELDS = [
	POI::ID,
	POI::NAME,
	POI::LAT,
	POI::LNG,
	POI::WIKIDATA,
	POI::ADDRESS,
	POI::CAPTION,
	POI::EXTERNAL_URLS,
	POI::COMUNE_,
	Comune::NAME,
];

// sub-query filter used to count only the published medias of a POI
$only_published_medias = function( $query ) {
	$query->joinMedia();
	$query->whereMediaIsPublished();
};

$pois = new QueryPOI();
$pois->defaultClass( POIFull::class );
$pois->joinComune( 'LEFT' );
$pois->select( $POI_FIELDS );
$pois->selectPOIMediaCount( $only_published_medias );
if( $POIID ) {
	$pois->wherePOIID( $POIID );
} else {
	$pois->whereStr( POI::STATUS, 'published' );
}
$pois->compare( POI::WIKIDATA, 'IS NOT', 'NULL' );

// first the POIs that were never marked as imported
$pois->orderBy( 'poi_lastwikidataimport', 'ASC' );

$wikidata = Wikidata::instance()
	->login( WIKIMEDIA_BOT_USERNAME, WIKIMEDIA_BOT_PASSWORD );

$poi_generator = $pois->queryGenerator();

// process 50 at time
foreach( chunk_generator( $poi_generator, 50 ) as $poi_chunk ) {

	// index the POIs of this chunk by their Wikidata ID
	$poi_by_wikidata_id = [];
	foreach( $poi_chunk as $poi ) {

		// note that Wikidata codes can be empty
		$poi_wikidata_id = $poi->get( POI::WIKIDATA );
		if( $poi_wikidata_id ) {
			$poi_by_wikidata_id[ $poi_wikidata_id ] = $poi;
		}
	}

	// nothing to ask to Wikidata for this chunk
	if( !$poi_by_wikidata_id ) {
		continue;
	}

	$entities = [];
	try {
		$entities = $wikidata->fetch( [
			'action' => 'wbgetentities',
			'ids'    => array_keys( $poi_by_wikidata_id ),
			'props'  => $ENTITY_PROPS,
		] );
	} catch( mw\API\NoSuchEntityException $e ) {
		// the whole chunk is skipped: $entities stays empty
		\cli\Log::error( $e->getMessage() );
		sleep( 5 );
	}

	// process each entity returned by Wikidata
	foreach( $entities->entities ?? [] as $entity ) {

		$entity_id     = $entity->id;
		$new_entity_id = null;

		// the entity was merged upstream: we asked for 'from' and we received 'to'
		if( isset( $entity->redirects ) ) {
			$entity_id     = $entity->redirects->from;
			$new_entity_id = $entity->redirects->to;
		}

		$poi = $poi_by_wikidata_id[ $entity_id ] ?? null;
		if( !$poi ) {
			var_dump( $entity );
			die( "not found entity\n" );
		}

		$need_to_refresh_poi = false;

		$entity_data = $wikidata->createDataModelFromObject( $entity );

		// allow to refresh the POI later
		do {

			$poi_ID = $poi->getPOIID();

			echo "Analyzing POI {$poi_ID}\n";

			// check if this POI need to be refreshed (maybe this is the second loop)
			if( $need_to_refresh_poi ) {

				// pull data from AtlasFor
				$poi = ( new QueryPOI() )
					->select( $POI_FIELDS )
					->selectPOIMediaCount( $only_published_medias )
					->joinComune( 'LEFT' )
					->wherePOI( $poi )
					->defaultClass( POIFull::class )
					->queryRow();

				// pull most recent Wikidata ID
				$entity_id = $poi->get( POI::WIKIDATA );

				// pull data from Wikidata
				$entity_data = $wikidata->fetchSingleEntity( $entity_id, [
					'props' => $ENTITY_PROPS,
				] );

				$need_to_refresh_poi = false;
			}

			// empty container for information to be saved
			$new_data = $entity_data->cloneEmpty();

			// wikidata standard reference
			$reference = new Reference( [

				// reference URL: AtlasFor
				new SnakURL( 'P854', "https://atlasf.eu/poi/$poi_ID" ),

				// retrieved point in time: now
				new SnakTime( 'P813' ),
			] );

			// update redirected Wikidata ID
			if( $poi && $new_entity_id ) {
				update_poi( $poi, [
					new DBCol( POI::WIKIDATA, $new_entity_id, 's' ),
				] );
			}

			// lines of the Wikidata edit summary (exports)
			$summary = [];

			// messages about what was changed in the local database (imports)
			$imports = [];

			// export italian label
			if( !$entity_data->hasLabelInLanguage( 'it' ) ) {
				$label = new Label( 'it', $poi->getPOIName() );
				$new_data->setLabel( $label );
				$summary[] = "add label it: " . $poi->getPOIName();
			}

			// export English label
			if( !$entity_data->hasLabelInLanguage( 'en' ) ) {
				$label = new Label( 'en', $poi->getPOIName() );
				$new_data->setLabel( $label );
				$summary[] = "add label en: " . $poi->getPOIName();
			}

			// for each importable identifier (from a Wikidata property like Official website)
			foreach( $importable_identifiers as $importable_identifier ) {

				// this may be official website, Facebook, etc.
				$wikidata_property = $importable_identifier->getWikidataProperty();
				$identifier_name   = $importable_identifier->getExternalsitecategoryName();

				// import official website or whatever official external URL (take all claims)
				foreach( $entity_data->getClaimsInProperty( $wikidata_property ) as $claim ) {

					// index the websites by the language
					$sites_by_lang = [];

					// try to associate to a language
					$site = $claim->getMainsnak()->getDataValue()->getValue();
					if( $site ) {

						// read 'language of work or name' to identify the language
						foreach( $claim->getQualifiersInProperty( 'P407' ) as $qualifier ) {
							$lang = $qualifier->getDataValue()->getValue()[ 'id' ];

							// languages we do not know are ignored
							if( $lang && isset( $WIKIDATA_LANGUAGES[ $lang ] ) ) {
								$sites_by_lang[ $WIKIDATA_LANGUAGES[ $lang ] ] = $site;
							}
						}

						// case for no language specified
						if( !$sites_by_lang ) {
							$sites_by_lang['none'] = $site;
						}
					}

					// save official website
					foreach( $sites_by_lang as $lang => $site ) {

						// query the externalsite (in that language)
						$externalsite_query = new ExternalsiteApi();
						$externalsite_query->whereExternalsitecategory( $importable_identifier );
						if( $lang === 'none' ) {
							$externalsite_query->whereExternalsiteLangIsNULL();
						} else {
							$externalsite_query->whereExternalsiteLang( $lang );
						}

						$externalsite = $externalsite_query->queryRow();
						if( !$externalsite ) {
							error_die( "missing site for {$identifier_name}[lang:$lang]" );
						}

						/*
						 * If the italian language is found, delete the website without language that is less precise
						 * This is useful when:
						 *  - Today I import an official website without language
						 *  - Tomorrow I import that official website in Italian (someone specified the language of name or work)
						 */
						if( $lang === 'it' ) {

							// check if it exists a version of this identifier without language
							$neutral_externalsite = ( new ExternalsiteApi() )
								->whereExternalsiteCategory( $importable_identifier )
								->whereExternalsiteLangIsNULL()
								->queryRow();

							// delete the neutral entries (if any)
							if( $neutral_externalsite ) {
								$neutral_externalsitepage_query = ( new ExternalsitepageApi() )
									->whereExternalsite( $neutral_externalsite )
									->wherePOI( $poi );

								if( $neutral_externalsitepage_query->queryRow() ) {
									$imports[] = "delete imprecise version of $identifier_name\n";
									$neutral_externalsitepage_query->delete();
								}
							}
						}

						// find existing external site page
						$externalsitepage = ( new ExternalsitepageApi() )
							->whereExternalsite( $externalsite )
							->wherePOI( $poi )
							->queryRow();

						if( $externalsitepage ) {

							// already imported: if now it's deprecated upstream, drop it
							if( $claim->isDeprecated() ) {
								( new ExternalsitepageApi() )
									->whereExternalsitepage( $externalsitepage )
									->delete();

								$imports[] = "deleted now deprecated {$identifier_name}[lang:$lang]";
							}
						} else {

							// not yet imported: import only if it's not deprecated
							if( !$claim->isDeprecated() ) {
								( new ExternalsitepageApi() )
									->insertRow( [
										new DBCol( 'externalsite_ID',                $externalsite->getExternalsiteID(), 'd' ),
										new DBCol( 'externalsitepage_uid',           $site,                              's' ),
										new DBCol( 'poi_ID',                         $poi_ID,                            'd' ),
										new DBCol( 'externalsitepage_lastedit_date', 'NOW()',                            '-' ),
										new DBCol( 'externalsitepage_lastedit_user', 1,                                  'd' ),
									] );

								$imports[] = "import {$identifier_name}[lang:$lang]: '$site'";
							}
						}
					}
				}
			}

			// heuristic: entities with these properties are treated as areas and not as single points
			$is_area =
				   $entity_data->hasClaimsInProperty( 'P159'  ) // headquarters location
				|| $entity_data->hasClaimsInProperty( 'P1332' ) // coordinates of northernmost point
				|| $entity_data->hasClaimsInProperty( 'P1333' ) // coordinates of southernmost point
				|| $entity_data->hasClaimsInProperty( 'P1334' ) // coordinates of easternmost point
				|| $entity_data->hasClaimsInProperty( 'P1335' ) // coordinates of westernmost point
				|| $entity_data->hasClaimsInProperty( 'P1376' ) // capital of
				|| $entity_data->hasClaimsInProperty( 'P1082' ) // population
				|| $entity_data->hasClaimsInProperty( 'P2046' ) // area (km)
			;

			// eventually add coordinates
			if( !$is_area ) {

				$all_coordinates = $entity_data->getClaimsInProperty( 'P625' );

				// at the moment we assume that we have better coordinates ONLY if there are not
				$we_have_better_coordinates = !count( $all_coordinates );

				// true: remove our obsolete claim; false: keep it but mark it as deprecated
				$remove_obsolete_coordinates = true;

				// check if we already saved some coordinates
				foreach( $all_coordinates as $claim_coordinates ) {

					$found_our_claim_reference = find_claim_reference_url( $claim_coordinates, 'https://atlasf.eu/' );

					// check if we can upgrade coordinates (since they differs, on AtlasFor are probably more recent)
					if( $found_our_claim_reference && $UP_COORDS ) {

						// TODO: do something to understand if this claim was added by us automatically

						$upstream_coordinates_raw     = $claim_coordinates->getMainsnak()->getDataValue()->getValue();
						$upstream_coordinates_raw_lat = $upstream_coordinates_raw['latitude'];
						$upstream_coordinates_raw_lng = $upstream_coordinates_raw['longitude'];

						if( has_poi_different_coordinates( $poi, $upstream_coordinates_raw_lat, $upstream_coordinates_raw_lng ) ) {

							// show this coordinates
							echo "Upstream coordinates with rank " . $claim_coordinates->getRank() . "\n";
							echo GeoTools::geohackURL( [
								'lat' => $upstream_coordinates_raw_lat,
								'lng' => $upstream_coordinates_raw_lng,
							] );
							echo "\n\n";

							echo "Current coordinates:\n";
							echo $poi->getPOIGeohackURL() . "\n";
							echo "\n";

							// everything that is not an explicit 'n' is a 'yes'
							$we_have_better_coordinates_input = readline( "Is the new better? [Y/n]\n" );
							$we_have_better_coordinates_input = strtolower( $we_have_better_coordinates_input );
							if( $we_have_better_coordinates_input !== 'n' ) {

								$we_have_better_coordinates = true;

								if( $remove_obsolete_coordinates ) {

									// mark for removal
									$claim_coordinates->markForRemoval();
									$new_data->addClaim( $claim_coordinates );
									$summary[] = "remove obsolete [[Property:P625|coordinate location]]";

								} else {

									// set this as obsolete
									$claim_coordinates->setRankDeprecated();
									$claim_coordinates->addQualifier( new SnakItem( 'P2241', 'Q107356532' ) );
									$new_data->addClaim( $claim_coordinates );
									$summary[] = "obsolete [[Property:P625|coordinate location]]";
								}
							}
						} else {
							echo "Upstream up to date\n";
						}
					}
				}

				if( $we_have_better_coordinates ) {

					// coordinate location
					$statement = new StatementGlobeCoordinate( 'P625', $poi->getPOILatitude(), $poi->getPOILongitude(), 0.01 );

					// append coordinates
					$new_data->addClaim( $statement->addReference( $reference ) );
					$summary[] = "add [[Property:P625|coordinate location]]";

					echo "Our location:\n";
					echo $poi->getPOIGeohackURL() . "\n";
				}
			}

			// address import-export
			$address = $poi->get( POI::ADDRESS );
			if( $address ) {

				// we have an address: export it if it's long enough and if it's missing upstream
				if( !$is_area ) {
					if( strlen( $address ) > $MIN_LENGTH_ADDRESS && !$entity_data->hasClaimsInProperty( 'P6375' ) ) {
						$statement = new StatementMonolingualText( 'P6375', 'it', $address );
						$new_data->addClaim( $statement->addReference( $reference ) );
						$summary[] = "add [[Property:P6375|address]]: $address";
					}
				}
			} else {

				// we have not an address: import the Italian one from Wikidata (if any)
				$new_address = null;
				foreach( $entity_data->getBestClaimsInProperty( 'P6375' ) as $claim ) {
					$value = $claim->getMainSnak()->getDataValue()->getValue();
					if( $value['language'] === 'it' ) {
						$new_address = $value['text'];
					}
				}

				if( $new_address ) {
					$imports[] = "import address: $new_address";
					update_poi( $poi, [
						new DBCol( POI::ADDRESS, $new_address, 's' ),
					] );
				}
			}

			// reset for each POI
			// null: undecided, false: already present upstream, true: to be exported
			$add_identifier = null;

			// check if we already have a local identifier
			foreach( $entity_data->getBestClaimsInProperty( WIKIDATA_PROPERTY ) as $claim ) {

				$identifier_upstream = $claim->getMainSnak()->getDataValue()->getValue();

				$to = ( new QueryPOIRedirect() )
					->select( POIFull::fieldsForPermalink() )
					->wherePOIRedirectFromID( $identifier_upstream )
					->joinPOIRedirectTo()
					->joinCategory()
					->queryRow();

				// redirect found?
				if( $to ) {

					$claim->setRankDeprecated();

					// reason for deprecated rank: withdrawn identifier value
					$claim->addQualifier( new SnakItem( 'P2241', 'Q21441764' ) );

					// prepare for saving
					$new_data->addClaim( $claim );

					$summary[] = "deprecate [[Property:" . WIKIDATA_PROPERTY. "|AtlasFor ID]]: $identifier_upstream";

				} else {

					// check if already present
					// (loose comparison on purpose: the upstream value is a string, our ID may be an integer)
					if( $identifier_upstream == $poi_ID ) {
						$add_identifier = false;
					}
				}
			}

			// check if it has sense to save the identifier
			if( $add_identifier !== false ) {

				if( $new_data->getClaims() ) {

					// this is not the only change, save the identifier
					$add_identifier = true;

				} else {

					// this is the only change: export it only if the POI has enough published medias
					$poi_medias = $poi->count_medias ?? 0;
					if( !$NO_JUST_ID && $poi_medias > 2 ) {
						$add_identifier = true;
					}
				}
			}

			// add the AtlasFor identifier to Wikidata
			if( $add_identifier ) {
				$statement = new StatementExternalID( WIKIDATA_PROPERTY, (string) $poi_ID );
				$new_data->addClaim( $statement );
				$summary[] = "add [[Property:" . WIKIDATA_PROPERTY. "|AtlasFor ID]]: $poi_ID";
			}

			// show some imported informations
			if( $imports && !$NO_IMPORT ) {
				echo "Import\n\t";
				echo implode( "\n\t", $imports );
				echo "\n";
			}

			// have we any change to be saved?
			if( $summary ) {

				// is this export mode?
				if( $EXPORT ) {

					// edit summary
					$summary_raw = implode( "; ", $summary );

					// show some info
					echo "\n";
					echo "\n";
					echo "POI {$poi->getPOIPermalinkShort()}\n";
					echo " " . $poi->getPOIName() . "\n";
					echo " " . $poi->get( Comune::NAME ) . "\n";
					echo " " . $poi->get( POI::CAPTION ) . "\n";
					echo "WIKIDATA https://www.wikidata.org/wiki/{$entity->id}\n";
					echo " " . $entity_data->getWhateverLabelValue( $LANGUAGE_PREFERENCES ) . "\n";
					echo " " . $entity_data->getWhateverDescriptionValue( $LANGUAGE_PREFERENCES ) . "\n";
					echo "SUMMARY:\n";
					foreach( $summary as $summary_line ) {
						echo " $summary_line\n";
					}

					// TODO: show the pending changes with $new_data->printChanges()
					//       (the DataValue are not specific, so the getPrintableWikitext() does not work)

					// user interaction (everything that is not 'n', 'r' or 'd' is a 'yes')
					$interaction = readline( "Save? [Y/n/r/d] (Yes/no/refresh/dropID from DB)" );
					$interaction = strtolower( $interaction );
					if( $interaction === 'r' ) {

						// run again the do-while loop with fresh data
						$need_to_refresh_poi = true;

					} elseif( $interaction === 'd' ) {

						echo "Dropping Wikidata ID from our POI...\n";

						// drop Wikidata ID
						( new QueryPOI() )
							->wherePOIID( $poi_ID )
							->update( [
								POI::WIKIDATA => null,
							] );

					} elseif( $interaction !== 'n' ) {

						echo "Saving...\n";

						// a failed save is logged and skipped: the script continues with the next POI
						try {
							$new_data->editEntity( [
								'summary' => $summary_raw,
								'bot'     => 1,
							] );
						} catch( \wb\API\ModificationFailedException $e ) {
							\cli\Log::warn( $e->getMessage() );
							sleep( 2 );
						} catch( \mw\API\PermissionDeniedException $e ) {
							\cli\Log::error( $e->getMessage() );
							sleep( 2 );
						}

					} else {
						echo "Skipped manually...\n";
					}
				} else {
					echo "Skipped (not in export mode)...\n";
				}
			}

		} while( $need_to_refresh_poi );

		// mark this POI as just seen
		( new QueryPOI() )
			->wherePOIID( $poi_ID )
			->update( [
				new DBCol( 'poi_lastwikidataimport', 'NOW()', '-' ),
			] );
	}
}

/**
 * A small shortcut to update a single POI
 *
 * The last edit user and the last edit date are always updated as well.
 *
 * @param POI   $poi  POI to be updated
 * @param array $data Columns to be saved (DBCol objects)
 */
function update_poi( $poi, $data ) {

	$data[] = new DBCol( POI::LASTEDIT_USER, 1,       'd' );
	$data[] = new DBCol( POI::LASTEDIT_DATE, 'NOW()', '-' );

	( new QueryPOI() )
		->wherePOI( $poi )
		->update( $data );
}

/**
 * Given lot of results, split them in small chunks
 *
 * Every chunk has exactly $n entries, but the last one that can have less.
 *
 * @param iterable $entries Array or generator of entries
 * @param int      $n       Maximum number of entries in every chunk
 * @generator
 */
function chunk_generator( $entries, $n ) {

	$chunk = [];
	foreach( $entries as $entry ) {
		$chunk[] = $entry;

		// the chunk is full as soon as it reaches $n entries (not $n + 1)
		if( count( $chunk ) >= $n ) {
			yield $chunk;
			$chunk = [];
		}
	}

	// the last chunk, partially filled
	if( $chunk ) {
		yield $chunk;
	}
}

/**
 * Check if a Claim contains a reference URL
 *
 * @param Claim  $claim
 * @param string $url_part Part of the URL to be found in a 'reference URL' (P854)
 * @return Reference|false The first reference with a matching URL, or false
 */
function find_claim_reference_url( $claim, $url_part ) {

	$references = $claim->getReferences();
	foreach( $references as $reference ) {

		// reference URLs
		$snaks = $reference->getSnaksInProperty( 'P854' );
		foreach( $snaks as $snak ) {
			$snak_datavalue = $snak->getDataValue();
			$snak_value     = $snak_datavalue->getValue();

			// strpos() returns false when the needle is not found (it never returns -1)
			if( strpos( (string) $snak_value, $url_part ) !== false ) {
				return $reference;
			}
		}
	}

	return false;
}

/**
 * Compare POI's coordinates with some others
 *
 * @param POI   $poi
 * @param float $lat2
 * @param float $lng2
 * @return boolean
 */
function has_poi_different_coordinates( $poi, $lat2, $lng2 ) {

	$lat1 = $poi->getPOILatitude();
	$lng1 = $poi->getPOILongitude();

	return are_different_coordinates( $lat1, $lng1, $lat2, $lng2 );
}

/**
 * Check if two coordinates are very very similar
 *
 * NOTE: The numbers are considered the same if their truncation is the same.
 *       Both are converted to strings and cut to the length of the shortest one.
 *
 * @param float $lat1 Latitude (or longitude)
 * @param float $lat2 Latitude (or longitude)
 * @return boolean
 */
function are_same_coordinates( $lat1, $lat2 ) {

	// work on the textual representation
	$lat1 = (string) $lat1;
	$lat2 = (string) $lat2;

	$minlen = min( strlen( $lat1 ), strlen( $lat2 ) );

	return substr( $lat1, 0, $minlen ) === substr( $lat2, 0, $minlen );
}

/**
 * Check if two pairs of coordinates are different
 *
 * @param float $lat1 Latitude  (1)
 * @param float $lng1 Longitude (1)
 * @param float $lat2 Latitude  (2)
 * @param float $lng2 Longitude (2)
 * @return boolean
 */
function are_different_coordinates( $lat1, $lng1, $lat2, $lng2 ) {
	return !are_same_coordinates( $lat1, $lat2 )
	    || !are_same_coordinates( $lng1, $lng2 );
}