Differenze tra le versioni di "Sostegno/2022/Microgrant WMI/Relazione conclusiva/script-import-export.php"
Vai alla navigazione
Vai alla ricerca
(export from /home/www-data/landscapefor/atlas/beta/cli/wikidata-import-export.php) |
(Nessuna differenza)
|
Versione attuale delle 12:32, 6 giu 2022
This is the AtlasFor's Wikidata import-export command line interface script.
It's designed to:
- list all AtlasFor points of interest, least recently imported first
- chunk them 50 at a time, to then do a single HTTP call per chunk to the Wikidata wbgetentities API
- this is done using the boz-mw framework, via Wikidata::instance()->fetch()
- update our local Wikidata identifiers with any upstream Wikidata redirect (fixing merges)
- export our label in Wikidata, only if one is missing
- export our short description in Wikidata, only if one is missing
- export our geolocation in Wikidata, only if one is missing and if it's not an area
- if the preferred geolocation is NOT the one from AtlasFor, investigate why
- allow pushing a better geolocation to Wikidata, or pulling one from it
- import Wikipedia editions (in some languages)
- import Wikimedia Commons link
- import official website (keeping language), and other identifiers
- export our street address, only if one is missing and if it's not an area
- import it, if we do not have it
- export our local identifier (https://www.wikidata.org/wiki/Property:P7004) if we saved at least 1 thing, or if our POI is important
/home/www-data/landscapefor/atlas/beta/cli/wikidata-import-export.php
#!/usr/bin/php
<?php
# Landscapefor-map - The "Landscapefor" map management system
# Copyright (C) 2018, 2019, 2020, 2021, 2022 Valerio Bozzolan
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
// this script is meant for the command line only:
// when $argv[0] is not set, quit immediately with exit status 1
if( isset( $argv[ 0 ] ) === false ) {
	exit( 1 );
}
// autoload suckless-php
require __DIR__ . '/../load.php';
// autoload boz-php
require BOZ_MW;
// declare usage of useful classes
use \wm\Wikidata;
use \wb\StatementExternalID;
use \wb\StatementGlobeCoordinate;
use \wb\StatementMonolingualText;
use \wb\Label;
use \wb\Reference;
use \wb\SnakURL;
use \wb\SnakTime;
use \wb\SnakItem;
// long-only command line options: "poiid" requires a value, the others are plain flags
$opts = getopt( '', [
	'poiid:',
	'no-export',
	'no-import-messages',
	'no-just-identifiers',
	'upgrade-coordinates',
] );

// languages tried, in this order, when showing a Wikidata label or description
$LANGUAGE_PREFERENCES = [ 'it', 'en' ];

// settings derived from the command line
$POIID      = $opts['poiid'] ?? null;
$EXPORT     = !isset( $opts['no-export'] );
$NO_IMPORT  = isset( $opts['no-import-messages'] );
$NO_JUST_ID = isset( $opts['no-just-identifiers'] );

// when a single POI is selected, always try to upgrade the coordinates
$UP_COORDS = $POIID || isset( $opts['upgrade-coordinates'] );

// recap of the effective settings
echo "--poiid specified: ", ( $POIID ?: 'no' ), "\n";
echo "--no-export specified: ", ( $EXPORT ? 'no' : 'yes' ), "\n";
echo "--upgrade-coordinates specified: ", ( $UP_COORDS ? 'yes' : 'no' ), "\n";
echo "--no-import-messages specified: ", ( $NO_IMPORT ? 'yes' : 'no' ), "\n";
echo "--no-just-identifiers specified: ", ( $NO_JUST_ID ? 'yes' : 'no' ), "\n";

// an address is exported only when longer than this many bytes,
// to avoid saving too generic addresses (e.g. just 'Torino')
$MIN_LENGTH_ADDRESS = 14;

// Wikidata language items (values of 'language of work or name') => language codes
$WIKIDATA_LANGUAGES = [
	'Q1860' => 'en',
	'Q652'  => 'it',
	'Q150'  => 'fr',
	'Q7850' => 'zh',
	'Q8798' => 'uk',
];

// properties to be queried from Wikidata APIs
$ENTITY_PROPS = [ 'labels', 'descriptions', 'claims' ];
// get the externalsites that can be extracted from a Wikidata property
// (externalsites joined with their category, keeping only rows whose Wikidata property is not NULL)
$importable_identifiers =
( new ExternalsiteApi() )
->joinExternalsitecategory()
->whereWikidataPropertyIsNotNULL()
->queryResults();
// columns selected for every POI; the same list is reused by the main loop when a POI is re-read
$POI_FIELDS = [
POI::ID,
POI::NAME,
POI::LAT,
POI::LNG,
POI::WIKIDATA,
POI::ADDRESS,
POI::CAPTION,
POI::EXTERNAL_URLS,
POI::COMUNE_,
Comune::NAME,
];
// callback passed to selectPOIMediaCount(): joins the medias and keeps only the published ones
$only_published_medias = function( $query ) {
$query->joinMedia();
$query->whereMediaIsPublished();
};
// build the query of the POIs to be processed
$pois = new QueryPOI();
$pois->defaultClass( POIFull::class );
$pois->joinComune( 'LEFT') ;
$pois->select( $POI_FIELDS );
$pois->selectPOIMediaCount( $only_published_medias );
// either the single POI requested with --poiid (whatever its status), or every published POI
if( $POIID ) {
$pois->wherePOIID( $POIID );
} else {
$pois->whereStr( POI::STATUS, 'published' );
}
// only POIs with a Wikidata ID set
$pois->compare( POI::WIKIDATA, 'IS NOT', 'NULL' );
// oldest 'poi_lastwikidataimport' first
// NOTE(review): presumably never-imported POIs have a NULL date and so sort first - confirm
$pois->orderBy( 'poi_lastwikidataimport', 'ASC' );
// Wikidata API client, logged in with the bot credentials
$wikidata = Wikidata::instance()
->login( WIKIMEDIA_BOT_USERNAME, WIKIMEDIA_BOT_PASSWORD );
// iterate the results through a generator (consumed chunk by chunk by the main loop)
$poi_generator = $pois->queryGenerator();
// Main loop: walk the POIs in chunks (chunk size 50 passed to chunk_generator()),
// fetch the related Wikidata entities with one wbgetentities call per chunk, then,
// for each entity, import data in AtlasFor and prepare/export data to Wikidata.
foreach( chunk_generator( $poi_generator, 50 ) as $poi_chunk ) {

	// index the POIs of this chunk by their Wikidata ID
	$poi_by_wikidata_id = [];
	foreach( $poi_chunk as $poi ) {
		// note that Wikidata codes can be empty
		if( $poi->get( POI::WIKIDATA ) ) {
			$poi_by_wikidata_id[ $poi->get( POI::WIKIDATA ) ] = $poi;
		}
	}

	// no usable Wikidata ID in this chunk: avoid an API call without any ID
	if( !$poi_by_wikidata_id ) {
		continue;
	}

	$entities = [];
	try {
		$entities = $wikidata->fetch( [
			'action' => 'wbgetentities',
			'ids'    => array_keys( $poi_by_wikidata_id ),
			'props'  => $ENTITY_PROPS,
		] );
	} catch( \mw\API\NoSuchEntityException $e ) {
		// NOTE(review): in this case the whole chunk is skipped and its POIs are not
		// marked as seen, so the same chunk will presumably be retried at the next run
		\cli\Log::error( $e->getMessage() );
		sleep( 5 );
	}

	// one entity for each requested Wikidata ID
	foreach( $entities->entities ?? [] as $entity ) {

		$entity_id     = $entity->id;
		$new_entity_id = null;

		// the entity was merged upstream: our POI is indexed by the old ID ("from")
		if( isset( $entity->redirects ) ) {
			$entity_id     = $entity->redirects->from;
			$new_entity_id = $entity->redirects->to;
		}

		// use ?? to reach the explicit error below without an "undefined array key" warning
		$poi = $poi_by_wikidata_id[ $entity_id ] ?? null;
		if( !$poi ) {
			var_dump( $entity );
			die( "not found entity\n" );
		}

		$need_to_refresh_poi = false;

		$entity_data = $wikidata->createDataModelFromObject( $entity );

		// allow to refresh the POI later
		do {

			$poi_ID = $poi->getPOIID();

			echo "Analyzing POI {$poi_ID}\n";

			// check if this POI need to be refreshed (maybe this is the second loop)
			if( $need_to_refresh_poi ) {

				// pull data from AtlasFor
				$poi = ( new QueryPOI() )
					->select( $POI_FIELDS )
					->selectPOIMediaCount( $only_published_medias )
					->joinComune( 'LEFT' )
					->wherePOI( $poi )
					->defaultClass( POIFull::class )
					->queryRow();

				// pull most recent Wikidata ID
				$entity_id = $poi->get( POI::WIKIDATA );

				// pull data from Wikidata
				$entity_data = $wikidata->fetchSingleEntity( $entity_id, [
					'props' => $ENTITY_PROPS,
				] );

				$need_to_refresh_poi = false;
			}

			// empty container for information to be saved
			$new_data = $entity_data->cloneEmpty();

			// wikidata standard reference
			$reference = new Reference( [
				// reference URL: AtlasFor
				new SnakURL( 'P854', "https://atlasf.eu/poi/$poi_ID" ),

				// retrieved point in time: now
				new SnakTime( 'P813' ),
			] );

			// update redirected Wikidata ID
			if( $poi && $new_entity_id ) {
				update_poi( $poi, [
					new DBCol( POI::WIKIDATA, $new_entity_id, 's' ),
				] );
			}

			// $summary: changes to be exported to Wikidata (also used as edit summary)
			// $imports: changes already applied to the AtlasFor database
			$summary = [];
			$imports = [];

			// export italian label
			if( !$entity_data->hasLabelInLanguage( 'it' ) ) {
				$label = new Label( 'it', $poi->getPOIName() );
				$new_data->setLabel( $label );
				$summary[] = "add label it: " . $poi->getPOIName();
			}

			// export English label
			if( !$entity_data->hasLabelInLanguage( 'en' ) ) {
				$label = new Label( 'en', $poi->getPOIName() );
				$new_data->setLabel( $label );
				$summary[] = "add label en: " . $poi->getPOIName();
			}

			// for each importable identifier (from a Wikidata property like Official website)
			foreach( $importable_identifiers as $importable_identifier ) {

				// this may be official website, Facebook, etc.
				$wikidata_property = $importable_identifier->getWikidataProperty();
				$identifier_name   = $importable_identifier->getExternalsitecategoryName();

				// import official website or whatever official external URL (take all claims)
				foreach( $entity_data->getClaimsInProperty( $wikidata_property ) as $claim ) {

					// index the websites by the language
					$sites_by_lang = [];

					// try to associate to a language
					$site = $claim->getMainsnak()->getDataValue()->getValue();
					if( $site ) {

						// read 'language of work or name' to identify the language
						foreach( $claim->getQualifiersInProperty( 'P407' ) as $qualifier ) {
							$lang = $qualifier->getDataValue()->getValue()[ 'id' ];
							if( $lang ) {
								// languages not listed in $WIKIDATA_LANGUAGES are ignored
								if( isset( $WIKIDATA_LANGUAGES[ $lang ] ) ) {
									$sites_by_lang[ $WIKIDATA_LANGUAGES[ $lang ] ] = $site;
								}
							}
						}

						// case for no language specified
						if( !$sites_by_lang ) {
							$sites_by_lang['none'] = $site;
						}
					}

					// save official website
					foreach( $sites_by_lang as $lang => $site ) {

						// query the externalsite (in that language)
						$externalsite_query = new ExternalsiteApi();
						$externalsite_query->whereExternalsitecategory( $importable_identifier );
						if( $lang === 'none' ) {
							$externalsite_query->whereExternalsiteLangIsNULL();
						} else {
							$externalsite_query->whereExternalsiteLang( $lang );
						}

						$externalsite = $externalsite_query->queryRow();
						if( !$externalsite ) {
							error_die( "missing site for {$identifier_name}[lang:$lang]" );
						}

						/*
						 * If the italian language is found, delete the website without language that is less precise
						 * This is useful when:
						 * - Today I import an official website without language
						 * - Tomorrow I import that official website in Italian (someone specified the language of name or work)
						 */
						if( $lang === 'it' ) {

							// check if it exists a version of this identifier without language
							$neutral_externalsite = ( new ExternalsiteApi() )
								->whereExternalsiteCategory( $importable_identifier )
								->whereExternalsiteLangIsNULL()
								->queryRow();

							// delete the neutral entries (if any)
							if( $neutral_externalsite ) {
								$neutral_externalsitepage_query =
									( new ExternalsitepageApi() )
										->whereExternalsite( $neutral_externalsite )
										->wherePOI( $poi );

								if( $neutral_externalsitepage_query->queryRow() ) {
									$imports[] = "delete imprecise version of $identifier_name\n";
									$neutral_externalsitepage_query->delete();
								}
							}
						}

						// find existing external site page
						$externalsitepage = ( new ExternalsitepageApi() )
							->whereExternalsite( $externalsite )
							->wherePOI( $poi )
							->queryRow();

						if( $externalsitepage ) {

							// already known: keep it, unless it's now deprecated upstream
							if( $claim->isDeprecated() ) {
								// if is deprecated, drop
								( new ExternalsitepageApi() )
									->whereExternalsitepage( $externalsitepage )
									->delete();

								$imports[] = "deleted now deprecated {$identifier_name}[lang:$lang]";
							}
						} else {

							// not yet known: import only if it's not deprecated
							if( !$claim->isDeprecated() ) {
								( new ExternalsitepageApi() )
									->insertRow( [
										new DBCol( 'externalsite_ID',                $externalsite->getExternalsiteID(), 'd' ),
										new DBCol( 'externalsitepage_uid',           $site,                              's' ),
										new DBCol( 'poi_ID',                         $poi_ID,                            'd' ),
										new DBCol( 'externalsitepage_lastedit_date', 'NOW()',                            '-' ),
										new DBCol( 'externalsitepage_lastedit_user', 1,                                  'd' ),
									] );

								$imports[] = "import {$identifier_name}[lang:$lang]: '$site'";
							}
						}
					}
				}
			}

			// heuristic: the entity is considered an area (and not a point)
			// if it has at least one of these properties
			$is_area =
				$entity_data->hasClaimsInProperty( 'P159'  ) // headquarters location
				||
				$entity_data->hasClaimsInProperty( 'P1332' ) // coordinates of northernmost point
				||
				$entity_data->hasClaimsInProperty( 'P1333' ) // coordinates of southernmost point
				||
				$entity_data->hasClaimsInProperty( 'P1334' ) // coordinates of easternmost point
				||
				$entity_data->hasClaimsInProperty( 'P1335' ) // coordinates of westernmost point
				||
				$entity_data->hasClaimsInProperty( 'P1376' ) // capital of
				||
				$entity_data->hasClaimsInProperty( 'P1082' ) // population
				||
				$entity_data->hasClaimsInProperty( 'P2046' ) // area (km)
			;

			// eventually add coordinates
			if( !$is_area ) {

				$all_coordinates = $entity_data->getClaimsInProperty( 'P625' );

				// at the moment we assume that we have better coordinates ONLY if there are not
				$we_have_better_coordinates = false;
				if( !count( $all_coordinates ) ) {
					$we_have_better_coordinates = true;
				}

				// check if we already saved some coordinates
				foreach( $all_coordinates as $claim_coordinates ) {

					$found_our_claim_reference = find_claim_reference_url( $claim_coordinates, 'https://atlasf.eu/' );

					// check if we can upgrade coordinates (since they differs, on AtlasFor are probably more recent)
					if( $found_our_claim_reference && $UP_COORDS ) {

						// TODO: do something to understand if:
						// this is our: automatic
						$upstream_coordinates_raw     = $claim_coordinates->getMainsnak()->getDataValue()->getValue();
						$upstream_coordinates_raw_lat = $upstream_coordinates_raw['latitude'];
						$upstream_coordinates_raw_lng = $upstream_coordinates_raw['longitude'];

						if( has_poi_different_coordinates( $poi, $upstream_coordinates_raw_lat, $upstream_coordinates_raw_lng ) ) {

							// show this coordinates
							echo "Upstream coordinates with rank " . $claim_coordinates->getRank() . "\n";
							echo GeoTools::geohackURL( [
								'lat' => $upstream_coordinates_raw_lat,
								'lng' => $upstream_coordinates_raw_lng,
							] );
							echo "\n\n";
							echo "Current coordinates:\n";
							echo $poi->getPOIGeohackURL() . "\n";
							echo "\n";

							// whatever answer different from "n" means yes
							$we_have_better_coordinates_input = readline( "Is the new better? [Y/n]\n" );
							$we_have_better_coordinates_input = strtolower( $we_have_better_coordinates_input );
							if( $we_have_better_coordinates_input !== 'n' ) {

								$we_have_better_coordinates = true;

								// hard-coded switch: remove the old claim (the alternative is to deprecate it)
								if( true ) {
									// mark for removal
									$claim_coordinates->markForRemoval();
									$new_data->addClaim( $claim_coordinates );
									$summary[] = "remove obsolete [[Property:P625|coordinate location]]";
								} else {
									// set this as obsolete
									$claim_coordinates->setRankDeprecated();
									$claim_coordinates->addQualifier( new SnakItem( 'P2241', 'Q107356532' ) );
									$new_data->addClaim( $claim_coordinates );
									$summary[] = "obsolete [[Property:P625|coordinate location]]";
								}
							}
						} else {
							echo "Upstream up to date\n";
						}
					}
				}

				if( $we_have_better_coordinates ) {

					// coordinate location
					$statement = new StatementGlobeCoordinate( 'P625',
						$poi->getPOILatitude(),
						$poi->getPOILongitude(),
						0.01
					);

					// append coordinates
					$new_data->addClaim( $statement->addReference( $reference ) );
					$summary[] = "add [[Property:P625|coordinate location]]";

					echo "Our location:\n";
					echo $poi->getPOIGeohackURL() . "\n";
				}
			}

			// address import-export
			$address = $poi->get( POI::ADDRESS );
			if( $address ) {
				if( !$is_area ) {
					// export address (only if long enough and if Wikidata has none)
					if( strlen( $address ) > $MIN_LENGTH_ADDRESS && !$entity_data->hasClaimsInProperty( 'P6375' ) ) {
						$statement = new StatementMonolingualText( 'P6375', 'it', $address );
						$new_data->addClaim( $statement->addReference( $reference ) );
						$summary[] = "add [[Property:P6375|address]]: $address";
					}
				}
			} else {

				$new_address = null;

				// import address from Wikidata (the last best claim in Italian wins)
				foreach( $entity_data->getBestClaimsInProperty( 'P6375' ) as $claim ) {
					$value = $claim->getMainSnak()->getDataValue()->getValue();
					if( $value['language'] === 'it' ) {
						$new_address = $value['text'];
					}
				}

				if( $new_address ) {
					$imports[] = "import address: $new_address";
					update_poi( $poi, [
						new DBCol( POI::ADDRESS, $new_address, 's' ),
					] );
				}
			}

			// reset for each POI
			// (null = undecided, false = already present upstream, true = to be added)
			$add_identifier = null;

			// check if we already have a local identifier
			foreach( $entity_data->getBestClaimsInProperty( WIKIDATA_PROPERTY ) as $claim ) {

				$identifier_upstream = $claim->getMainSnak()->getDataValue()->getValue();

				// is the upstream identifier a POI that now redirects to another one?
				$to =
					( new QueryPOIRedirect() )
						->select( POIFull::fieldsForPermalink() )
						->wherePOIRedirectFromID( $identifier_upstream )
						->joinPOIRedirectTo()
						->joinCategory()
						->queryRow();

				// redirect found?
				if( $to ) {
					$claim->setRankDeprecated();

					// reason for deprecated rank: withdrawn identifier value
					$claim->addQualifier( new SnakItem( 'P2241', 'Q21441764' ) );

					// prepare for saving
					$new_data->addClaim( $claim );
					$summary[] = "deprecate [[Property:" . WIKIDATA_PROPERTY. "|AtlasFor ID]]: $identifier_upstream";
				} else {
					// check if already present
					if( $identifier_upstream == $poi_ID ) {
						$add_identifier = false;
					}
				}
			}

			// check if it has sense to save the identifier
			if( $add_identifier !== false ) {

				// if we have more changes
				if( $new_data->getClaims() ) {
					// this is not the only change, save the identifier
					$add_identifier = true;
				} else {
					// this is the only change
					// check if it still has sense: the POI must have more than 2 published medias
					$poi_medias = $poi->count_medias ?? 0;
					if( !$NO_JUST_ID && $poi_medias > 2 ) {
						$add_identifier = true;
					}
				}
			}

			// add the AtlasFor identifier to Wikidata
			if( $add_identifier ) {
				$statement = new StatementExternalID( WIKIDATA_PROPERTY, (string) $poi_ID );
				$new_data->addClaim( $statement );
				$summary[] = "add [[Property:" . WIKIDATA_PROPERTY. "|AtlasFor ID]]: $poi_ID";
			}

			// show some imported informations
			if( $imports && !$NO_IMPORT ) {
				echo "Import\n\t";
				echo implode( "\n\t", $imports );
				echo "\n";
			}

			// have we any change to be saved?
			if( $summary ) {

				// is this export mode?
				if( $EXPORT ) {

					// edit summary
					$summary_raw = implode( "; ", $summary );
					// $summary_raw .= " from [[Q65769786|AtlasFor]] {$poi->getPOIPermalinkShort()}";

					// show some info
					echo "\n";
					echo "\n";
					echo "POI {$poi->getPOIPermalinkShort()}\n";
					echo " " . $poi->getPOIName() . "\n";
					echo " " . $poi->get( Comune::NAME ) . "\n";
					echo " " . $poi->get( POI::CAPTION ) . "\n";
					echo "WIKIDATA https://www.wikidata.org/wiki/{$entity->id}\n";
					echo " " . $entity_data->getWhateverLabelValue( $LANGUAGE_PREFERENCES ) . "\n";
					echo " " . $entity_data->getWhateverDescriptionValue( $LANGUAGE_PREFERENCES ) . "\n";
					echo "SUMMARY:\n";
					foreach( $summary as $summary_line ) {
						echo " $summary_line\n";
					}

					// show something interesting
					// TODO: the DataValue are not specific, so the getPrintableWikitext() does not work
					// $new_data->printChanges();
					//print_r( $new_data->get() );

					// user interaction (whatever answer different from "r", "d" and "n" means yes)
					$interaction = readline( "Save? [Y/n/r/d] (Yes/no/refresh/dropID from DB)" );
					$interaction = strtolower( $interaction );
					if( $interaction === 'r' ) {

						// run again the do-while, re-reading both the POI and the entity
						$need_to_refresh_poi = true;

					} elseif( $interaction === 'd' ) {

						echo "Dropping Wikidata ID from our POI...\n";

						// drop Wikidata ID
						( new QueryPOI() )
							->wherePOIID( $poi_ID )
							->update( [
								POI::WIKIDATA => null,
							] );

					} elseif( $interaction !== 'n' ) {

						echo "Saving...\n";

						// a failed save is logged and does not stop the batch
						try {
							$new_data->editEntity( [
								'summary' => $summary_raw,
								'bot'     => 1,
							] );
						} catch( \wb\API\ModificationFailedException $e ) {
							\cli\Log::warn( $e->getMessage() );
							sleep( 2 );
						} catch( \mw\API\PermissionDeniedException $e ) {
							\cli\Log::error( $e->getMessage() );
							sleep( 2 );
						}

					} else {
						echo "Skipped manually...\n";
					}
				} else {
					echo "Skipped (not in export mode)...\n";
				}
			}

		} while( $need_to_refresh_poi );

		// mark this POI as just seen
		( new QueryPOI() )
			->wherePOIID( $poi_ID )
			->update( [
				new DBCol( 'poi_lastwikidataimport', 'NOW()', '-' ),
			] );
	}
}
/**
 * Shortcut to update a single POI, also recording who/when edited it
 *
 * The last-edit user (1) and the last-edit date (NOW()) are always
 * appended to the columns to be saved.
 *
 * @param POI   $poi  POI to be updated
 * @param array $data Columns to be saved (DBCol objects)
 */
function update_poi( $poi, $data ) {

	// audit columns, saved together with the requested ones
	$audit_columns = [
		new DBCol( POI::LASTEDIT_USER, 1,       'd' ),
		new DBCol( POI::LASTEDIT_DATE, 'NOW()', '-' ),
	];
	foreach( $audit_columns as $audit_column ) {
		$data[] = $audit_column;
	}

	$query = new QueryPOI();
	$query->wherePOI( $poi )->update( $data );
}
/**
 * Given lot of results, split them in small chunks
 *
 * The entries are consumed lazily: every yielded chunk is a list with
 * exactly $n entries, apart the last one that can be shorter.
 * Nothing is yielded if there are no entries.
 *
 * @param iterable $entries Array or generator of entries
 * @param int      $n       Maximum number of entries in each chunk
 * @generator
 */
function chunk_generator( $entries, $n ) {
	$chunk = [];
	foreach( $entries as $entry ) {
		$chunk[] = $entry;

		// flush as soon as the chunk is full
		// (">=" and not ">", otherwise every chunk would have $n + 1 entries)
		if( count( $chunk ) >= $n ) {
			yield $chunk;
			$chunk = [];
		}
	}

	// last partial chunk
	if( $chunk ) {
		yield $chunk;
	}
}
/**
 * Find the first reference of a Claim with a reference URL (P854) containing a text
 *
 * @param object $claim    Claim to be inspected (it must expose getReferences())
 * @param string $url_part Text to be searched inside each reference URL
 * @return object|false The first matching reference (as returned by getReferences()), or false if none matches
 */
function find_claim_reference_url( $claim, $url_part ) {
	foreach( $claim->getReferences() as $reference ) {

		// reference URLs
		foreach( $reference->getSnaksInProperty( 'P854' ) as $snak ) {
			$url = $snak->getDataValue()->getValue();

			// strpos() returns false (never -1) when the text is not found
			if( is_string( $url ) && strpos( $url, $url_part ) !== false ) {
				return $reference;
			}
		}
	}
	return false;
}
/**
 * Check if the coordinates of a POI differ from another pair of coordinates
 *
 * The comparison itself is delegated to are_different_coordinates().
 *
 * @param POI   $poi  POI providing the first latitude and longitude
 * @param float $lat2 Latitude to compare with
 * @param float $lng2 Longitude to compare with
 * @return boolean
 */
function has_poi_different_coordinates( $poi, $lat2, $lng2 ) {
	return are_different_coordinates(
		$poi->getPOILatitude(),
		$poi->getPOILongitude(),
		$lat2,
		$lng2
	);
}
/**
 * Check if two coordinate values are very very similar
 *
 * NOTE: The values are compared as text, only along the length of the
 * shortest one: they are the same if one is the truncation of the other.
 *
 * @param float $lat1 Latitude (or longitude)
 * @param float $lat2 Latitude (or longitude)
 * @return boolean
 */
function are_same_coordinates( $lat1, $lat2 ) {
	// number of leading characters that both values have
	$common_length = min( strlen( $lat1 ), strlen( $lat2 ) );

	// equal if they share that whole prefix
	return strncmp( $lat1, $lat2, $common_length ) === 0;
}
/**
 * Check if two pairs of coordinates are different
 *
 * The pairs are different as soon as the latitudes, or the longitudes,
 * are not the same according to are_same_coordinates().
 *
 * @param float $lat1 Latitude (1)
 * @param float $lng1 Longitude (1)
 * @param float $lat2 Latitude (2)
 * @param float $lng2 Longitude (2)
 * @return boolean
 */
function are_different_coordinates( $lat1, $lng1, $lat2, $lng2 ) {
	$same_point =
		are_same_coordinates( $lat1, $lat2 )
		&&
		are_same_coordinates( $lng1, $lng2 );

	return !$same_point;
}