<?php
/**
 * Scraper endpoints.
 *
 * @package Nisje
 */

namespace Dekode\Nisje\Components\Rest;

// Stop immediately unless the ABSPATH constant is defined, so this file
// cannot be executed by requesting it directly.
defined( 'ABSPATH' ) || die( 'Shame on you' );

/**
 * Scraper Rest class.
 */
class Scraper_Controller extends \WP_REST_Controller {

	/**
	 * Identifier passed to the authentication helper and used as the schema title.
	 *
	 * Declared explicitly so it is no longer created as a dynamic property
	 * (deprecated as of PHP 8.2). It stays public because that is the
	 * visibility the previously dynamic property had.
	 *
	 * @var string
	 */
	public $hook_base;

	/**
	 * Constructor.
	 *
	 * Sets the REST namespace, the route base and the hook base.
	 */
	public function __construct() {
		$this->namespace = nisje_get_rest_namespace();
		$this->rest_base = 'scraper';
		$this->hook_base = 'scraper';
	}

	/**
	 * Register the routes.
	 *
	 * Exposes a single readable route that takes a required `url` argument.
	 */
	public function register_routes() {
		register_rest_route(
			$this->namespace, '/' . $this->rest_base . '/', [
				[
					'methods'             => \WP_REST_Server::READABLE,
					'permission_callback' => [ $this, 'get_item_permissions_check' ],
					'callback'            => [ $this, 'get_item' ],
					'args'                => [
						'url' => [
							'type'        => 'string',
							'format'      => 'uri',
							'description' => esc_html__( 'URL to scrape', 'nisje' ),
							'required'    => true,
						],
					],
				],
				'schema' => [ $this, 'get_public_item_schema' ],
			]
		);
	}

	/**
	 * Check if a given request has access to fetch og.
	 *
	 * @param  WP_REST_Request $request Full details about the request.
	 * @return WP_Error|boolean The authentication error, or true when access is granted.
	 */
	public function get_item_permissions_check( $request ) {
		$auth = nisje_validate_rest_authentication( $this->hook_base, 'get_item' );
		if ( is_wp_error( $auth ) ) {
			return $auth;
		}

		return true;
	}

	/**
	 * Fetch OG.
	 *
	 * Parses an HTML document and collects every <meta> element whose
	 * `property` attribute starts with "og:".
	 *
	 * @param string $data Data to be parsed.
	 * @return array $og Open graph data, keyed by property name without the "og:"
	 *                   prefix. Empty when nothing could be parsed. When a property
	 *                   occurs more than once the last occurrence wins.
	 */
	public function fetch_og( $data ) {
		$og = [];

		// DOMDocument::loadHTML() throws a ValueError on an empty string in PHP 8+,
		// and error suppression does not catch that, so bail out early.
		if ( ! is_string( $data ) || '' === trim( $data ) ) {
			return $og;
		}

		// Encode every non-ASCII character as a numeric entity so the HTML parser
		// reads UTF-8 markup correctly regardless of the charset it would assume.
		// This replaces mb_convert_encoding( ..., 'HTML-ENTITIES', ... ), which is
		// deprecated as of PHP 8.2.
		$html = mb_encode_numericentity( $data, [ 0x80, 0x10FFFF, 0, 0x1FFFFF ], 'UTF-8' );

		// Scraped markup is rarely valid. Collect libxml errors internally instead
		// of emitting warnings, then restore whatever setting was active before.
		$previous = libxml_use_internal_errors( true );
		$dom      = new \DOMDocument();
		$loaded   = $dom->loadHTML( $html );
		libxml_clear_errors();
		libxml_use_internal_errors( $previous );

		if ( ! $loaded ) {
			return $og;
		}

		$xpath = new \DOMXPath( $dom );
		// Query metatags with og prefix.
		$metas = $xpath->query( '//*/meta[starts-with(@property, \'og:\')]' );

		if ( false === $metas ) {
			return $og;
		}

		foreach ( $metas as $meta ) {
			// Drop only the leading "og:" (the XPath guarantees it is present).
			// Removing every occurrence would mangle names such as "og:blog:x".
			// The cast keeps the key a string on PHP versions where substr()
			// returns false for a bare "og:".
			$property = (string) substr( $meta->getAttribute( 'property' ), 3 );

			$og[ $property ] = $meta->getAttribute( 'content' );
		}

		return $og;
	}

	/**
	 * Retrieve graph data from a website.
	 *
	 * Downloads the requested page and returns its title and images, preferring
	 * Open Graph metadata and falling back to the <title> element and <img> tags.
	 *
	 * @param WP_REST_Request $request Rest Request.
	 * @return WP_REST_Response|WP_Error Scraped page data on success, WP_Error otherwise.
	 */
	public function get_item( $request ) {
		$url = $request['url'];

		if ( ! is_string( $url ) || '' === $url ) {
			return new \WP_Error(
				'nisje_rest_scraper_invalid_url', esc_html__( 'Empty URL. Please provide a valid URL.', 'nisje' ), [
					'status' => 404,
				]
			);
		}

		// esc_url_raw() rather than esc_url(): the latter prepares a URL for HTML
		// display and encodes "&" as "&#038;", which corrupts query strings when
		// the value is used for an HTTP request.
		$uri = esc_url_raw( wp_unslash( $url ) );
		$uri = preg_replace( '/\/#.+?$/', '', $uri );

		// Direct links to images are not scraped.
		if ( preg_match( '/\.(jpe?g|jpe|gif|png)\b/i', $uri ) ) {
			return new \WP_Error(
				'nisje_rest_scraper_invalid_url', esc_html__( 'Please provide a valid URL.', 'nisje' ), [
					'status' => 401,
				]
			);
		}

		// The URL comes from the client, so use the "safe" variant, which makes
		// WordPress reject unsafe URLs (e.g. hosts on the local network) instead
		// of letting this endpoint be used to probe internal services.
		$response = wp_safe_remote_get(
			$uri, [
				'user-agent' => 'Nisje',
			]
		);

		if ( is_wp_error( $response ) ) {
			return new \WP_Error(
				'nisje_rest_scraper_invalid_url', $response->get_error_message(), [
					'status' => 400,
				]
			);
		}

		$content = wp_remote_retrieve_body( $response );

		// wp_remote_retrieve_body() returns an empty string, never false, when
		// there is no body, so that is the condition to test for.
		if ( ! is_string( $content ) || '' === trim( $content ) ) {
			return new \WP_Error(
				'nisje_rest_scraper_invalid_url', esc_html__( 'Site not found.', 'nisje' ), [
					'status' => 404,
				]
			);
		}

		$og = $this->fetch_og( $content );

		$page = [
			'title'       => $this->extract_title( $content, $og ),
			'is_og_image' => false,
		];

		if ( isset( $og['image'] ) ) {
			$page['images']      = [ $og['image'] ];
			$page['is_og_image'] = true;
		} else {
			$page['images'] = $this->extract_image_sources( $content, $uri );
		}

		$data = $this->prepare_item_for_response( $page, $request );

		return rest_ensure_response( $data );
	}

	/**
	 * Determine the page title: og:title when present, otherwise the <title> element.
	 *
	 * @param string $content Raw HTML of the page.
	 * @param array  $og      Open graph data as returned by fetch_og().
	 * @return string Trimmed title, or an empty string when none was found.
	 */
	private function extract_title( $content, $og ) {
		if ( isset( $og['title'] ) ) {
			return trim( html_entity_decode( $og['title'] ) );
		}

		// Only read the capture group when the pattern actually matched;
		// pages without a <title> element yield an empty title.
		if ( preg_match( '/<title[^>]*>([^<]+)<\/title>/im', $content, $titles ) ) {
			return trim( html_entity_decode( $titles[1] ) );
		}

		return '';
	}

	/**
	 * Collect the `src` of every <img> tag in the page as an escaped URL.
	 *
	 * Sources without "//" are treated as relative and resolved against the
	 * root of the scraped site (not against the directory of the page).
	 *
	 * @param string $content Raw HTML of the page.
	 * @param string $uri     URL the page was fetched from.
	 * @return array List of image URLs, empty when the page has no <img> tags.
	 */
	private function extract_image_sources( $content, $uri ) {
		// "\s" after "<img" so that tags whose attributes start on a new line are
		// matched as well. The markup is matched as-is: stripping line breaks
		// from it first would glue "<img" and "src" together in such tags.
		$pattern = '/<img\s([^>]*)src=(\"|\')([^<>\'\"]+)(\2)([^>]*)\/*>/i';

		if ( ! preg_match_all( $pattern, $content, $matches ) ) {
			return [];
		}

		$parts  = wp_parse_url( $uri );
		$scheme = ( is_array( $parts ) && isset( $parts['scheme'] ) ) ? $parts['scheme'] : 'http';
		$host   = ( is_array( $parts ) && isset( $parts['host'] ) ) ? $parts['host'] : '';

		if ( '' !== $host && isset( $parts['port'] ) ) {
			$host .= ':' . $parts['port'];
		}

		$sources = [];

		foreach ( $matches[3] as $src ) {
			// Line breaks and tabs inside an attribute value are not part of the URL.
			$src = str_replace( [ "\n", "\t", "\r" ], '', $src );

			if ( false === strpos( $src, '//' ) && '' !== $host ) {
				// Reuse the scheme (and port) of the scraped page so images of an
				// https site are not turned into http links.
				$src = $scheme . '://' . str_replace( '//', '/', $host . '/' . $src );
			}

			// Kept as esc_url() so the response payload stays the same for existing clients.
			$sources[] = esc_url( $src );
		}

		return $sources;
	}

	/**
	 * Prepares scraper data for return as an object.
	 *
	 * @param array           $item    Item data with the keys `title`, `images` and `is_og_image`.
	 * @param WP_REST_Request $request Rest Request.
	 * @param boolean         $is_raw  Optional, not used. Defaults to false.
	 *
	 * @return WP_REST_Response
	 */
	public function prepare_item_for_response( $item, $request, $is_raw = false ) {
		$schema = $this->get_item_schema();

		$data = [];

		// Missing keys fall back to empty values instead of raising notices.
		if ( ! empty( $schema['properties']['title'] ) ) {
			$data['title'] = esc_html( isset( $item['title'] ) ? $item['title'] : '' );
		}

		if ( ! empty( $schema['properties']['images'] ) ) {
			$data['images'] = isset( $item['images'] ) ? $item['images'] : [];
		}

		if ( ! empty( $schema['properties']['is_og_image'] ) ) {
			$data['is_og_image'] = isset( $item['is_og_image'] ) ? (bool) $item['is_og_image'] : false;
		}

		$context = ! empty( $request['context'] ) ? $request['context'] : 'view';
		$data    = $this->add_additional_fields_to_object( $data, $request );
		$data    = $this->filter_response_by_context( $data, $context );

		$response = rest_ensure_response( $data );

		/**
		 * Filter a scraper value returned from the API.
		 *
		 * @param WP_REST_Response $response Response holding the prepared data.
		 * @param array            $item     Scraped item data the response was built from.
		 * @param WP_REST_Request  $request  Request used to generate the response.
		 */
		return apply_filters( 'nisje_rest_prepare_scraper_value', $response, $item, $request );
	}

	/**
	 * Get the scraper schema, conforming to JSON Schema.
	 *
	 * @return array
	 */
	public function get_item_schema() {
		$schema = [
			'$schema'    => 'http://json-schema.org/draft-04/schema#',
			'title'      => $this->hook_base,
			'type'       => 'object',
			'properties' => [
				'title'       => [
					'description' => esc_html__( 'OG Title.', 'nisje' ),
					'type'        => 'string',
					'context'     => [ 'view', 'edit' ],
					'arg_options' => [
						'sanitize_callback' => 'wp_filter_post_kses',
					],
					'readonly'    => true,
				],
				'images'      => [
					'description' => esc_html__( 'OG Images.', 'nisje' ),
					'type'        => 'array',
					'items'       => [
						'type'   => 'string',
						'format' => 'uri',
					],
					'context'     => [ 'view', 'edit' ],
					'readonly'    => true,
				],
				'is_og_image' => [
					'description' => esc_html__( 'Whether or not the image is OG.', 'nisje' ),
					'type'        => 'boolean',
					'context'     => [ 'view', 'edit' ],
					'readonly'    => true,
				],
			],
		];

		return $this->add_additional_fields_schema( $schema );
	}
}
