Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karimkawambwa/1a6a3c4cae11dd99d12ad511e5936da4 to your computer and use it in GitHub Desktop.
Save karimkawambwa/1a6a3c4cae11dd99d12ad511e5936da4 to your computer and use it in GitHub Desktop.
Arusha Press Club migration script
<?php
/**
 * Look up a published post of type "post" by its exact title.
 *
 * @param string $title Title to match against the post_title column.
 * @return string|null Whatever $wpdb->get_var() yields for the lookup: the ID of the
 *                     first matching row, or (per the wpdb API) null when no row matches.
 */
function post_exists_by_title($title) {
    global $wpdb;

    // The title is bound through prepare(); only the table name is interpolated.
    $sql = "SELECT ID FROM {$wpdb->posts} WHERE post_title = %s AND post_type = 'post' AND post_status = 'publish' LIMIT 1";

    return $wpdb->get_var($wpdb->prepare($sql, $title));
}
/**
 * Download a remote image into the uploads directory and make it a post's featured image.
 *
 * The image is downloaded only when no file with the same name already sits in the
 * current uploads folder; an existing file (and its attachment, if one is registered)
 * is reused instead of being overwritten. Failures are logged and leave the post untouched.
 *
 * @param int|WP_Error $post_id   Post to attach the image to. 0 or a WP_Error (a failed
 *                                wp_insert_post()) is ignored.
 * @param string|null  $image_url Remote image URL. Empty values are ignored.
 * @return void
 */
function set_featured_image($post_id, $image_url) {
    if (empty($image_url)) return;
    if (empty($post_id) || is_wp_error($post_id)) return;

    $upload_dir = wp_upload_dir(); // WordPress upload directory
    if (!empty($upload_dir['error'])) {
        error_log('set_featured_image: upload directory unavailable: ' . $upload_dir['error']);
        return;
    }

    // Build the file name from the URL path only (no query string). basename() is applied
    // again after decoding so an encoded "%2F" cannot point outside the uploads folder.
    $url_path = parse_url($image_url, PHP_URL_PATH);
    if (!is_string($url_path) || $url_path === '') {
        $url_path = $image_url;
    }
    $filename = sanitize_file_name(basename(urldecode(basename($url_path))));
    if ($filename === '') {
        error_log('set_featured_image: cannot derive a file name from ' . $image_url);
        return;
    }

    $file_path = $upload_dir['path'] . '/' . $filename;
    $attach_id = 0;

    if (file_exists($file_path)) {
        // Downloaded on an earlier run: look for the attachment that already points at it.
        $attach_id = attachment_url_to_postid($upload_dir['url'] . '/' . $filename);
    } else {
        $image_data = file_get_contents($image_url);
        if ($image_data === false || $image_data === '') {
            error_log('set_featured_image: download failed for ' . $image_url);
            return;
        }
        if (file_put_contents($file_path, $image_data) === false) {
            error_log('set_featured_image: cannot write ' . $file_path);
            return;
        }
    }

    if (!$attach_id) {
        // New file, or a file left behind without an attachment: register it now.
        $wp_filetype = wp_check_filetype($filename, null);
        $attachment = array(
            'post_mime_type' => $wp_filetype['type'],
            'post_title' => $filename,
            'post_content' => '',
            'post_status' => 'inherit'
        );
        $attach_id = wp_insert_attachment($attachment, $file_path, $post_id);
        if (!$attach_id || is_wp_error($attach_id)) {
            error_log('set_featured_image: could not create attachment for ' . $file_path);
            return;
        }

        // Generate the intermediate image sizes and metadata themes rely on.
        if (!function_exists('wp_generate_attachment_metadata')) {
            require_once ABSPATH . 'wp-admin/includes/image.php';
        }
        wp_update_attachment_metadata($attach_id, wp_generate_attachment_metadata($attach_id, $file_path));
    }

    // Set the found or new attachment as the featured image
    set_post_thumbnail($post_id, $attach_id);
}
/**
 * Resolve an author name to a WordPress user ID, creating the user when needed.
 *
 * Users created here get a login derived from the name via sanitize_title(), a random
 * password, a placeholder @example.com address and the "author" role.
 *
 * @param string $author_name Author name as scraped from the source site.
 * @return int ID of the existing or newly created user; 1 when the name is empty or
 *             the user cannot be created.
 */
function get_author_id($author_name) {
    // Sanitize the provided author name
    $author_name = sanitize_text_field($author_name);
    $login = sanitize_title($author_name);
    if ($login === '') {
        // Nothing usable to look up or create; use the same fallback as the error path.
        return 1;
    }

    // get_user_by() has no 'name' field (it accepts id, slug, email, login), so look the
    // author up by the login this function assigns when it creates a user...
    $user = get_user_by('login', $login);
    if (!$user) {
        // ...and otherwise by display name, for accounts that were created some other way.
        $matches = get_users(array(
            'search' => $author_name,
            'search_columns' => array('display_name'),
            'number' => 1
        ));
        $user = !empty($matches) ? $matches[0] : false;
    }
    if ($user) {
        // Return existing user ID
        return (int) $user->ID;
    }

    // User not found, create a new user
    $user_data = array(
        'user_login' => $login,
        'user_pass' => wp_generate_password(), // Generate a random password
        // Dummy address built from the login so it is unique per author and free of spaces.
        'user_email' => sanitize_email($login . '@example.com'),
        'display_name' => $author_name,
        'role' => 'author' // Set role to 'author'
    );
    $user_id = wp_insert_user($user_data);
    if (is_wp_error($user_id)) {
        error_log($user_id->get_error_message());
        return 1; // Fallback to default admin user ID
    }

    // Return the new user ID
    return (int) $user_id;
}
/**
 * Resolve a category name to a term ID in the "category" taxonomy, creating it when needed.
 *
 * @param string $category_name Category name as scraped from the source site.
 * @return int ID of the existing or newly created category. When the name is empty or
 *             the category cannot be created, the site's configured default category
 *             (the "default_category" option, 1 if unset) is returned.
 */
function get_category_id($category_name) {
    // Sanitize the provided category name
    $category_name = sanitize_text_field($category_name);
    $fallback_id = (int) get_option('default_category', 1);
    if ($category_name === '') {
        return $fallback_id;
    }

    // Check if the category already exists
    $category = get_term_by('name', $category_name, 'category');
    if ($category) {
        // Return existing category ID
        return (int) $category->term_id;
    }

    // Category not found, create a new category
    $category_data = wp_insert_term(
        $category_name, // the term
        'category', // the taxonomy
        array(
            'description' => '',
            'slug' => sanitize_title($category_name)
        )
    );
    if (!is_wp_error($category_data)) {
        // Return the new category ID
        return (int) $category_data['term_id'];
    }

    // A "term_exists" error carries the ID of the term that is already there; reuse it
    // rather than filing the post under the fallback category.
    $existing = $category_data->get_error_data('term_exists');
    if (is_object($existing) && isset($existing->term_id)) {
        $existing = $existing->term_id;
    }
    if (is_numeric($existing) && (int) $existing > 0) {
        return (int) $existing;
    }

    error_log($category_data->get_error_message());
    return $fallback_id;
}
/**
 * Download an article page and return the inner HTML of its body container.
 *
 * The body is the first <div> whose class attribute contains
 * "post-body entry-content".
 *
 * @param string|null $url Article URL.
 * @return string Inner HTML of the body container, or the literal string
 *                'No content found' when the URL is empty, the download fails,
 *                the response is empty, or the container is missing.
 */
function fetch_full_post_content($url) {
    if (!$url) return 'No content found';

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
    // Bound the request so one stalled server cannot hang the whole migration.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $html = curl_exec($ch);
    if ($html === false) {
        error_log('fetch_full_post_content: ' . curl_error($ch) . ' (' . $url . ')');
    }
    curl_close($ch);

    // curl_exec() yields false on failure and loadHTML() rejects an empty document,
    // so stop here instead of handing either to the parser.
    if (!is_string($html) || trim($html) === '') return 'No content found';

    $doc = new DOMDocument();
    // Silence markup warnings only for this parse, then restore the caller's setting.
    $previous_setting = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);
    if (!$loaded) return 'No content found';

    $xpath = new DOMXPath($doc);
    $contentNode = $xpath->query("//div[contains(@class, 'post-body entry-content')]")->item(0);
    if (!$contentNode) return 'No content found';

    // Extract only the inner HTML of the contentNode
    $innerHtml = '';
    foreach ($contentNode->childNodes as $childNode) {
        $innerHtml .= $doc->saveHTML($childNode);
    }
    return $innerHtml;
}
/**
 * Download a listing page and return an XPath handle over its parsed DOM.
 *
 * @param string $url Listing page URL.
 * @return DOMXPath|null Null when the download fails or the response is empty or unparseable.
 */
function fetch_posts_load_page($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
    // Bound the request so one stalled server cannot hang the whole migration.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $html = curl_exec($ch);
    if ($html === false) {
        error_log('fetch_posts: ' . curl_error($ch) . ' (' . $url . ')');
    }
    curl_close($ch);

    // curl_exec() yields false on failure and loadHTML() rejects an empty document.
    if (!is_string($html) || trim($html) === '') {
        return null;
    }

    $doc = new DOMDocument();
    // Silence markup warnings only for this parse, then restore the caller's setting.
    $previous_setting = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);

    return $loaded ? new DOMXPath($doc) : null;
}

/**
 * Create or update the WordPress post described by one <article> teaser.
 *
 * Teasers without a title link are skipped: they have neither a usable title to match
 * on nor a URL to fetch the body from.
 *
 * @param DOMXPath $xpath XPath handle for the listing page.
 * @param DOMNode  $post  The <article> element to import.
 * @return void
 */
function fetch_posts_import_article($xpath, $post) {
    $titleLink = $xpath->query(".//h2[@class='entry-title']/a", $post)->item(0);
    $title = $titleLink ? sanitize_text_field($titleLink->nodeValue) : '';
    if ($title === '') {
        error_log('fetch_posts: skipping an article without a title');
        return;
    }
    $fullPostContent = fetch_full_post_content($titleLink->getAttribute('href'));

    // Publication date. A string that carries its own UTC offset is honoured; one without
    // is read as site-local time. Both the local and the GMT column are then filled in.
    $dateNode = $xpath->query(".//time[@class='published']", $post)->item(0);
    $rawDate = $dateNode ? trim($dateNode->getAttribute('datetime')) : '';
    $siteTimezone = wp_timezone();
    try {
        $published = new DateTime($rawDate !== '' ? $rawDate : 'now', $siteTimezone);
    } catch (Exception $e) {
        $published = new DateTime('now', $siteTimezone);
    }
    $published->setTimezone($siteTimezone);
    $postDate = $published->format('Y-m-d H:i:s');
    $published->setTimezone(new DateTimeZone('UTC'));
    $postDateGmt = $published->format('Y-m-d H:i:s');

    $tagNode = $xpath->query(".//span[@class='entry-tag']", $post)->item(0);
    $tag = $tagNode ? $tagNode->nodeValue : 'Uncategorized';
    $authorNode = $xpath->query(".//span[@class='author-name']", $post)->item(0);
    $authorName = $authorNode ? $authorNode->nodeValue : 'Default Author';

    // Fetching the featured image URL
    $imageNode = $xpath->query(".//a[@class='entry-thumbnail']/span[@class='thumbnail']", $post)->item(0);
    $imageUrl = $imageNode ? $imageNode->getAttribute('data-src') : null;
    if ($imageUrl) {
        $urlParts = explode('/', $imageUrl);
        // Check if there are enough parts to replace the second last segment
        if (count($urlParts) > 2) {
            // Swap the second-last path segment for 'w1200-h628'
            // (presumably the size selector of the image host - confirm).
            $urlParts[count($urlParts) - 2] = 'w1200-h628';
            $imageUrl = implode('/', $urlParts);
        }
    }

    // Fetching the excerpt
    $excerptNode = $xpath->query(".//p[@class='entry-excerpt']", $post)->item(0);
    $excerpt = $excerptNode ? $excerptNode->nodeValue : '';

    $post_data = array(
        'post_title' => $title,
        'post_excerpt' => sanitize_text_field($excerpt),
        'post_content' => wp_kses_post($fullPostContent),
        'post_status' => 'publish',
        'post_author' => get_author_id($authorName), // Function to get/create author by name
        'post_date' => $postDate,
        'post_date_gmt' => $postDateGmt,
        'post_category' => array(get_category_id($tag)) // Function to get/create category by tag
    );

    $existing_post_id = post_exists_by_title($post_data['post_title']);
    if ($existing_post_id) {
        $post_data['ID'] = $existing_post_id;
        if ($fullPostContent === 'No content found') {
            // The body could not be fetched this time; keep what is already stored
            // instead of replacing it with the placeholder.
            unset($post_data['post_content']);
        }
        // wp_update_post()/wp_insert_post() expect slashed input and unslash it themselves.
        $result = wp_update_post(wp_slash($post_data), true);
    } else {
        $result = wp_insert_post(wp_slash($post_data), true);
    }

    if (is_wp_error($result) || !$result) {
        $reason = is_wp_error($result) ? $result->get_error_message() : 'unknown error';
        error_log('fetch_posts: could not save "' . $title . '": ' . $reason);
        return;
    }
    set_featured_image($result, $imageUrl);
}

/**
 * Import every article from a listing page and from each page it links to.
 *
 * Each <article> whose class contains "post" is created as a WordPress post, or updated
 * when a published post with the same title exists. The next page is taken from the
 * data-url attribute of the element with id "load-more"; the walk ends when that is
 * absent or empty, when a page cannot be loaded, or when a URL comes up a second time.
 *
 * @param string $url URL of the first listing page.
 * @return void
 */
function fetch_posts($url) {
    // Iterate rather than recurse: depth stays flat on long archives, and the visited
    // set stops a pagination link that points back at an earlier page.
    $visited = array();
    while ($url && !isset($visited[$url])) {
        $visited[$url] = true;

        $xpath = fetch_posts_load_page($url);
        if (!$xpath) {
            return;
        }

        foreach ($xpath->query("//article[contains(@class, 'post')]") as $post) {
            fetch_posts_import_article($xpath, $post);
        }

        $nextPage = $xpath->query("//a[@id='load-more']")->item(0);
        $url = $nextPage ? $nextPage->getAttribute('data-url') : '';
    }
}
// Start fetching from the first page: the site's /search listing. fetch_posts() imports
// the articles found there and then follows the page's "load-more" link on its own,
// so this single call drives the whole migration.
fetch_posts('https://www.arushapressclub.or.tz/search');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment