Created
May 2, 2024 18:30
-
-
Save karimkawambwa/1a6a3c4cae11dd99d12ad511e5936da4 to your computer and use it in GitHub Desktop.
Arusha Press Club migration script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
/**
 * Look up a published post of type "post" by its exact title.
 *
 * @param string $title Title to match against the post_title column.
 * @return mixed Whatever $wpdb->get_var() yields for the lookup query
 *               (the ID column of the first matching row, if any).
 */
function post_exists_by_title($title) {
    global $wpdb;

    // The title is bound through prepare() rather than interpolated into the SQL.
    $sql = "SELECT ID FROM {$wpdb->posts} WHERE post_title = %s AND post_type = 'post' AND post_status = 'publish' LIMIT 1";

    return $wpdb->get_var($wpdb->prepare($sql, $title));
}
/**
 * Download a remote image into the current uploads directory (unless a file
 * with the same name is already there) and set it as a post's featured image.
 *
 * Every failure (bad upload dir, failed download, failed write, failed
 * attachment insert) is logged via error_log() and leaves the post untouched.
 *
 * @param int    $post_id   ID of the post that receives the thumbnail.
 * @param string $image_url Absolute URL of the image to import.
 * @return void
 */
function set_featured_image($post_id, $image_url) {
    if (empty($post_id) || is_wp_error($post_id) || empty($image_url)) {
        return;
    }

    $upload_dir = wp_upload_dir(); // WordPress upload directory
    if (!empty($upload_dir['error'])) {
        error_log('set_featured_image: upload directory unavailable: ' . $upload_dir['error']);
        return;
    }

    // Build the file name from the URL *path* only (no query string), decode it
    // before taking the basename so an encoded "/" or ".." cannot escape the
    // uploads directory, then normalise it for the filesystem.
    $url_path = parse_url($image_url, PHP_URL_PATH);
    $filename = sanitize_file_name(basename(urldecode((string) $url_path)));
    if ($filename === '') {
        error_log('set_featured_image: cannot derive a file name from ' . $image_url);
        return;
    }

    $file_path = $upload_dir['path'] . '/' . $filename;
    $attach_id = 0;

    if (file_exists($file_path)) {
        // File was imported earlier: reuse its attachment instead of downloading again.
        $attach_id = attachment_url_to_postid($upload_dir['url'] . '/' . $filename);
    } else {
        $image_data = file_get_contents($image_url);
        if ($image_data === false || $image_data === '') {
            error_log('set_featured_image: download failed for ' . $image_url);
            return;
        }
        if (file_put_contents($file_path, $image_data) === false) {
            error_log('set_featured_image: could not write ' . $file_path);
            return;
        }
    }

    if (!$attach_id) {
        // Either a brand-new file, or a file on disk with no attachment record yet.
        $wp_filetype = wp_check_filetype($filename, null);
        $attachment = array(
            'post_mime_type' => $wp_filetype['type'],
            'post_title' => sanitize_file_name($filename),
            'post_content' => '',
            'post_status' => 'inherit'
        );
        $attach_id = wp_insert_attachment($attachment, $file_path, $post_id);
        if (!$attach_id || is_wp_error($attach_id)) {
            error_log('set_featured_image: could not create attachment for ' . $file_path);
            return;
        }

        // Generate the attachment metadata (image sizes etc.); the helper lives
        // in an admin include that is not loaded on every request.
        if (!function_exists('wp_generate_attachment_metadata')) {
            require_once ABSPATH . 'wp-admin/includes/image.php';
        }
        wp_update_attachment_metadata($attach_id, wp_generate_attachment_metadata($attach_id, $file_path));
    }

    // Set the found or new attachment as the featured image
    set_post_thumbnail($post_id, $attach_id);
}
/**
 * Resolve an author name to a WordPress user ID, creating the user if needed.
 *
 * The login of a created user is sanitize_title($author_name); the same value
 * is used to find that user again on later calls.
 *
 * @param string $author_name Author name as scraped from the source site.
 * @return int ID of the matching or newly created user, or 1 as a fallback
 *             when the name is empty or the user cannot be created.
 */
function get_author_id($author_name) {
    // Sanitize the provided author name
    $author_name = sanitize_text_field($author_name);
    $login = sanitize_title($author_name);

    if ($login === '') {
        error_log('get_author_id: empty author name, falling back to user ID 1');
        return 1;
    }

    // get_user_by() only supports the fields id/ID/slug/email/login; any other
    // field (such as 'name') never matches. Look the user up by the login this
    // function assigns on creation, then by the nicename (slug).
    $user = get_user_by('login', $login);
    if (!$user) {
        $user = get_user_by('slug', $login);
    }
    if ($user) {
        // Return existing user ID
        return $user->ID;
    }

    // User not found, create a new user
    $user_data = array(
        'user_login' => $login,
        'user_pass' => wp_generate_password(), // Generate a random password
        'user_email' => sanitize_email($author_name . '@example.com'), // Construct a dummy email
        'display_name' => $author_name,
        'role' => 'author' // Set role to 'author'
    );
    $user_id = wp_insert_user($user_data);

    if (is_wp_error($user_id)) {
        // Log the failure and fall back to user ID 1
        error_log($user_id->get_error_message());
        return 1;
    }

    // Return the new user ID
    return $user_id;
}
/**
 * Resolve a category name to a term ID in the 'category' taxonomy,
 * inserting the term when no term with that name is found.
 *
 * @param string $category_name Category label to look up or create.
 * @return int term_id of the existing or new category; 1 when the insert
 *             fails (the failure message is written with error_log()).
 */
function get_category_id($category_name) {
    $name = sanitize_text_field($category_name);

    // Reuse the term when one with this name is already present.
    $existing = get_term_by('name', $name, 'category');
    if ($existing) {
        return $existing->term_id;
    }

    // Otherwise insert it, with a slug derived from the sanitized name.
    $created = wp_insert_term($name, 'category', [
        'description' => '',
        'slug' => sanitize_title($name),
    ]);

    if (is_wp_error($created)) {
        error_log($created->get_error_message());
        return 1;
    }

    return $created['term_id'];
}
/**
 * Fetch a post page and return the inner HTML of its content container.
 *
 * The container is the first <div> whose class attribute contains
 * "post-body entry-content".
 *
 * @param string|null $url URL of the post page.
 * @return string Inner HTML of the content container, or the literal string
 *                'No content found' when the URL is empty, the request fails,
 *                the response is empty/unparseable, or no container exists.
 */
function fetch_full_post_content($url) {
    if (!$url) return 'No content found';

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_FAILONERROR, true); // treat HTTP >= 400 as a failure instead of parsing the error page
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); // do not let one dead host stall the whole run
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
    $html = curl_exec($ch);
    if ($html === false) {
        error_log('fetch_full_post_content: request for ' . $url . ' failed: ' . curl_error($ch));
    }
    curl_close($ch);

    // curl_exec() returns false on failure; DOMDocument::loadHTML() rejects an
    // empty source, so bail out before parsing.
    if (!is_string($html) || trim($html) === '') return 'No content found';

    // Silence markup warnings while parsing, then restore the caller's libxml setting.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) return 'No content found';

    $xpath = new DOMXPath($doc);
    $contentNode = $xpath->query("//div[contains(@class, 'post-body entry-content')]")->item(0);
    if (!$contentNode) return 'No content found';

    // Extract only the inner HTML of the contentNode
    $innerHtml = '';
    foreach ($contentNode->childNodes as $childNode) {
        $innerHtml .= $doc->saveHTML($childNode);
    }
    return $innerHtml;
}
/**
 * Import every post listed on a listing page and on all following pages.
 *
 * For each <article> whose class contains "post", the title, link, date, tag,
 * author, thumbnail and excerpt are read from the listing markup, the full
 * body is fetched with fetch_full_post_content(), and the post is inserted or,
 * when post_exists_by_title() finds it, updated. The next page is taken from
 * the data-url attribute of the element with id "load-more".
 *
 * Pages are walked in a loop; a URL that was already processed ends the walk,
 * so a listing that links back to itself cannot loop forever. A page that
 * cannot be fetched or parsed also ends the walk.
 *
 * @param string $url URL of the first listing page.
 * @return void
 */
function fetch_posts($url) {
    $visited = array();

    while ($url && !isset($visited[$url])) {
        $visited[$url] = true;

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_FAILONERROR, true); // treat HTTP >= 400 as a failure
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
        $html = curl_exec($ch);
        if ($html === false) {
            error_log('fetch_posts: request for ' . $url . ' failed: ' . curl_error($ch));
        }
        curl_close($ch);

        // curl_exec() returns false on failure and loadHTML() rejects an empty source.
        if (!is_string($html) || trim($html) === '') {
            return;
        }

        // Silence markup warnings while parsing, then restore the previous libxml setting.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if (!$loaded) {
            return;
        }

        $xpath = new DOMXPath($doc);
        $postsQuery = "//article[contains(@class, 'post')]";
        $nextPageQuery = "//a[@id='load-more']";

        foreach ($xpath->query($postsQuery) as $post) {
            $titleLink = $xpath->query(".//h2[@class='entry-title']/a", $post)->item(0);
            if (!$titleLink) {
                // Without a title link there is neither a title nor a URL to fetch
                // the body from; importing it would fold every such entry into one
                // placeholder post, so skip it.
                error_log('fetch_posts: skipping an entry without a title link on ' . $url);
                continue;
            }
            $title = $titleLink->nodeValue;
            $postUrl = $titleLink->getAttribute('href');
            $fullPostContent = fetch_full_post_content($postUrl);

            // Fall back to the current time when the date is missing or unparseable
            // (strtotime() returns false, which date() would render as 1970).
            $dateNode = $xpath->query(".//time[@class='published']", $post)->item(0);
            $timestamp = $dateNode ? strtotime($dateNode->getAttribute('datetime')) : false;
            if ($timestamp === false) {
                $timestamp = time();
            }

            $tagNode = $xpath->query(".//span[@class='entry-tag']", $post)->item(0);
            $tag = $tagNode ? $tagNode->nodeValue : 'Uncategorized';

            $authorNode = $xpath->query(".//span[@class='author-name']", $post)->item(0);
            $authorName = $authorNode ? $authorNode->nodeValue : 'Default Author';

            // Fetching the featured image URL
            $imageNode = $xpath->query(".//a[@class='entry-thumbnail']/span[@class='thumbnail']", $post)->item(0);
            $imageUrl = $imageNode ? $imageNode->getAttribute('data-src') : null;
            if ($imageUrl) {
                $urlParts = explode('/', $imageUrl);
                // Check if there are enough parts to replace the second last segment
                if (count($urlParts) > 2) {
                    // Replace the second last segment with 'w1200-h628'
                    $urlParts[count($urlParts) - 2] = 'w1200-h628';
                    $imageUrl = implode('/', $urlParts);
                }
            }

            // Fetching the excerpt
            $excerptNode = $xpath->query(".//p[@class='entry-excerpt']", $post)->item(0);
            $excerpt = $excerptNode ? $excerptNode->nodeValue : '';

            // Process the post here: check existence, create or update
            $post_data = array(
                'post_title' => sanitize_text_field($title),
                'post_excerpt' => sanitize_text_field($excerpt),
                'post_content' => wp_kses_post($fullPostContent),
                'post_status' => 'publish',
                'post_author' => get_author_id($authorName), // Function to get/create author by name
                'post_date' => date('Y-m-d H:i:s', $timestamp),
                'post_category' => array(get_category_id($tag)) // Function to get/create category by tag
            );

            $existing_post_id = post_exists_by_title($post_data['post_title']);
            if ($existing_post_id) {
                $post_data['ID'] = $existing_post_id;
                wp_update_post($post_data);
                set_featured_image($existing_post_id, $imageUrl);
            } else {
                $post_id = wp_insert_post($post_data);
                if (!$post_id || is_wp_error($post_id)) {
                    error_log('fetch_posts: could not insert post "' . $post_data['post_title'] . '"');
                    continue;
                }
                set_featured_image($post_id, $imageUrl);
            }
        }

        // Move on to the next listing page, if the page advertises one.
        $nextPage = $xpath->query($nextPageQuery)->item(0);
        $url = $nextPage ? $nextPage->getAttribute('data-url') : '';
    }
}
// Entry point: begin the import at the site's search listing page.
fetch_posts('https://www.arushapressclub.or.tz/search');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment