WordPress, on the other hand, is PHP, and I just suck at that. So there you go... Anyhow, after setting up my document types in Umbraco I needed to figure out how to get all my old content into the new site. WordPress offers to export the entire content as XML, so that part was easy. The exported file was 3 MB, mainly because of some screwed-up tags left over from when I was using the Ultimate Tag Warrior (I will miss the cool plugin names from WordPress), which spat out a whole lot of empty tags.

The exported format is basically an RSS feed, but with some extra elements added by WordPress. One of those is an <excerpt:encoded> element, whose "excerpt" prefix has no namespace declaration at the top of the file, which makes the export invalid XML. So I needed to fix this before handling the file in my import routine. I just added the missing declaration to the rss element:

<rss version="2.0"

xmlns:content="http://purl.org/rss/1.0/modules/content/"

xmlns:excerpt="http://purl.org/rss/1.0/modules/excerpt/"

xmlns:wfw="http://wellformedweb.org/CommentAPI/"

xmlns:dc="http://purl.org/dc/elements/1.1/"

xmlns:wp="http://wordpress.org/export/1.0/">

Sweet, now the XML is all nice and tidy and ready to be imported. So, how to do the import? Well, I decided to do it through the Umbraco API using a dashboard usercontrol. To get the content from the XML file, I chose to go with LINQ to XML, which is pretty neat for navigating through the XML file. The first thing I did was to disable Lucene's locking, because it made my import fail due to the number of operations done. I also set the script timeout value a bit high just to be sure:

// Give the import request a generous script timeout (300) so the long-running import is not cut off.
Server.ScriptTimeout = 300;

// Switch off Lucene's FSDirectory locking for this run; with locking enabled the import
// failed because of the number of operations it performs.
Lucene.Net.Store.FSDirectory.SetDisableLocks(true);  

Now, to load the Xml file. Pretty easy. I later added the possibility to enter the XML in a textarea instead, thus the commented out line:

// Read the WordPress export that sits next to the usercontrol.
string exportPath = Server.MapPath("~/usercontrols/wordpress.2009-08-01.xml");
XDocument loaded = XDocument.Load(exportPath);

// Namespaces of the WordPress-specific (wp:) and content (content:) elements in the export.
XNamespace wpns = "http://wordpress.org/export/1.0/";
XNamespace contentns = "http://purl.org/rss/1.0/modules/content/";

// Keep only the <item> elements whose wp:post_type is "post".
var q = loaded.Descendants("item")
              .Where(c => (string)c.Element(wpns + "post_type") == "post");

So now I have all my blog posts in the variable "q". Time to feed them into Umbraco. The code is not too nicely structured, but it does the job, and it's a one-time deal, so no need to go crazy here.

// Id of the existing Umbraco node that every imported post is created under. It is passed
// as the last argument of Document.MakeNew, the same position where comments pass the id
// of the post they belong to.
const int blogRootNodeId = 1049;

DocumentType dt = DocumentType.GetByAlias("BlogPost");
// Resolved on first use and then reused, instead of being looked up again for every comment.
DocumentType commentType = null;
// All imported documents are created and published as the user with id 0.
User author = User.GetUser(0);

foreach (XElement item in q)
{
    string posttitle = (string)item.Element("title");

    // The original code called Replace("", string.Empty) on the link, which throws
    // ArgumentException for every post because the search string is empty; it also
    // dereferenced a possibly missing <link> element.
    // NOTE(review): the text that was meant to be stripped is not visible in the source;
    // presumably it was the old site's root URL, so an absolute link is reduced to its
    // path and query here. Confirm this matches how blogPostLegacyUrl is consumed.
    string link = (string)item.Element("link") ?? string.Empty;
    Uri linkUri;
    string legacyurl = Uri.TryCreate(link, UriKind.Absolute, out linkUri)
        ? linkUri.PathAndQuery
        : link;

    string legacyid = (string)item.Element(wpns + "post_id");
    string posturlnodename = Server.UrlDecode((string)item.Element(wpns + "post_name"));
    string postbody = (string)item.Element(contentns + "encoded");

    // The export's timestamps are machine-generated, so parse them independently of the
    // server's current culture.
    DateTime createdate = DateTime.Parse(
        (string)item.Element(wpns + "post_date"),
        System.Globalization.CultureInfo.InvariantCulture);

    // Comma-separated nicenames of all <category domain="tag"> elements that have one.
    string posttags = string.Join(",",
        item.Elements("category")
            .Where(tag => (string)tag.Attribute("domain") == "tag")
            .Select(tag => (string)tag.Attribute("nicename"))
            .Where(nicename => !string.IsNullOrEmpty(nicename))
            .ToArray());

    Document doc = Document.MakeNew(posturlnodename, dt, author, blogRootNodeId);
    doc.getProperty("blogPostTitle").Value = posttitle;
    // Order matters: paragraphs are created first, then upload URLs are rewritten, then
    // [source] blocks are turned into <pre> blocks.
    doc.getProperty("blogPostBody").Value = WordpressPostParser.ParseCodeBlocks(
        WordpressPostParser.ChangeImageUrls(
            WordpressPostParser.CreateParagraphTags(postbody)));
    doc.getProperty("blogPostLegacyUrl").Value = legacyurl;
    doc.getProperty("blogPostLegacyID").Value = legacyid;
    doc.CreateDateTime = createdate;

    if (!string.IsNullOrEmpty(posttags))
    {
        umbraco.editorControls.tags.library.addTagsToNode(doc.Id, posttags, "default");
        doc.getProperty("blogPostTags").Value = posttags;
    }

    doc.Publish(author);
    umbraco.library.UpdateDocumentCache(doc.Id);

    // Import the post's comments as child documents; only comments marked as approved
    // (wp:comment_approved == "1") are carried over.
    foreach (XElement comment in item.Elements(wpns + "comment"))
    {
        if ((string)comment.Element(wpns + "comment_approved") != "1")
        {
            continue;
        }

        string commentAuthor = (string)comment.Element(wpns + "comment_author");
        string commentEmail = (string)comment.Element(wpns + "comment_author_email");
        string commentUrl = (string)comment.Element(wpns + "comment_author_url");
        string commentIP = (string)comment.Element(wpns + "comment_author_IP");
        string commentBody = (string)comment.Element(wpns + "comment_content");
        DateTime commentDate = DateTime.Parse(
            (string)comment.Element(wpns + "comment_date"),
            System.Globalization.CultureInfo.InvariantCulture);

        if (commentType == null)
        {
            commentType = DocumentType.GetByAlias("BlogComment");
        }

        // The comment document is named after its author and created beneath the post.
        Document commentdoc = Document.MakeNew(commentAuthor, commentType, author, doc.Id);
        commentdoc.getProperty("blogCommentAuthor").Value = commentAuthor;
        commentdoc.getProperty("blogCommentAuthorEmail").Value = commentEmail;
        commentdoc.getProperty("blogCommentAuthorURL").Value = commentUrl;
        commentdoc.getProperty("blogCommentAuthorIP").Value = commentIP;
        commentdoc.getProperty("blogCommentBody").Value = commentBody;
        commentdoc.CreateDateTime = commentDate;

        commentdoc.Publish(author);
        umbraco.library.UpdateDocumentCache(commentdoc.Id);
    }
}

I am using some external methods to parse the body text of the posts. This is because WordPress doesn't save HTML, but stores line breaks and adds the paragraph tags at render time... brrrr... There are also some [source] tags left over from the syntax highlighter plugin that I need to change:

These are the three methods I am using to parse the text:

/// <summary>
/// Wraps a post body in a paragraph element and turns every blank line (two consecutive
/// line breaks) into a paragraph boundary.
/// </summary>
/// <param name="postbody">
/// Raw post text with blank lines between paragraphs. A null value is treated as an
/// empty body.
/// </param>
/// <returns>
/// The body enclosed in &lt;p&gt;...&lt;/p&gt; with each blank line replaced by
/// &lt;/p&gt;&lt;p&gt;. An empty or null body yields &lt;p&gt;&lt;/p&gt;.
/// </returns>
/// <remarks>
/// Every blank line is converted, including blank lines that fall inside [source] blocks.
/// </remarks>
public static string CreateParagraphTags(string postbody)
{
    // A post without a body element arrives here as null; treat it like an empty body
    // instead of failing with a NullReferenceException.
    string body = postbody ?? string.Empty;

    // Accept both LF and CRLF line endings. For LF-only input this produces exactly the
    // same result as replacing "\n\n".
    string withParagraphBreaks = Regex.Replace(body, "\r?\n\r?\n", "</p><p>");

    return "<p>" + withParagraphBreaks + "</p>";
}

/// <summary>
/// Rewrites src and href attribute values that begin with "/wp-content" so that they
/// begin with "/media/images" instead.
/// </summary>
/// <param name="postbody">Post markup to rewrite.</param>
/// <returns>The markup with both kinds of attribute prefix replaced.</returns>
public static string ChangeImageUrls(string postbody)
{
    const RegexOptions options = RegexOptions.Singleline;

    // First pass: src="..." attributes.
    Regex srcPrefix = new Regex("src=\"/wp-content", options);
    string withSrcRewritten = srcPrefix.Replace(postbody, "src=\"/media/images");

    // Second pass: href="..." attributes, applied to the result of the first pass.
    Regex hrefPrefix = new Regex("href=\"/wp-content", options);
    return hrefPrefix.Replace(withSrcRewritten, "href=\"/media/images");
}

/// <summary>
/// Converts every [source ...]...[/source] block in a post body into a &lt;pre&gt; block
/// whose angle brackets are HTML-escaped.
/// </summary>
/// <param name="postbody">Post text that may contain [source] blocks.</param>
/// <returns>
/// The body with each [source] block replaced by &lt;pre&gt;escaped code&lt;/pre&gt;. Any
/// attributes on the opening [source] tag are dropped. A null or empty body is returned
/// unchanged.
/// </returns>
public static string ParseCodeBlocks(string postbody)
{
    if (string.IsNullOrEmpty(postbody))
    {
        return postbody;
    }

    // Group 3 captures the text between the opening and closing tag. Singleline lets '.'
    // span line breaks so multi-line code blocks are matched as one unit.
    Regex regPattern = new Regex(@"(\[source(.*?)\])(.*?)(\[/source\])", RegexOptions.Singleline);

    // Replace each match in a single pass instead of going through temporary placeholders.
    return regPattern.Replace(postbody, match =>
    {
        // Escape both brackets unconditionally. Previously '>' was only escaped when the
        // code also contained '<', so code such as "a > b" kept its raw '>'.
        string code = match.Groups[3].Value
            .Replace("<", "&lt;")
            .Replace(">", "&gt;");

        return "<pre>" + code + "</pre>";
    });
}

It's not perfect. For example it added some strange <p> tags inside my code blocks, but no more than I could handle by doing manual updates. For these methods I added some unit tests. It is just so much nicer to work with RegEx when you have tests to see if you are breaking existing matches while changing this stuff.