Jeffrey Veen wrote a blog post on content migration a couple of days ago.
Two things that can make life much easier are well-formed (and even better: semantic) markup and using a CMS with a well-formed API.
Recently I helped a company migrate all their latest news items into umbraco. The items weren't stored in a CMS, only as HTML. Luckily it was quite strict markup, which made it quite easy to import. Because it was valid, it allowed me to import the news items as XML, which made everything very easy.
Here's the source code for importing:
private void Page_Load(object sender, System.EventArgs e)
{
DocumentType blogPostType = DocumentType.GetByAlias("BlogPost");
User u = new User(2);
string dato = "";
string header = "";
XmlDocument ebitaContent = new XmlDocument();
ebitaContent.Load(@"C:\Documents and Settings\Niels Hartvig\Skrivebord\ebita.xml");
foreach (XmlNode post in ebitaContent.DocumentElement.ChildNodes)
{
if (post.Attributes != null && post.Attributes.GetNamedItem("class") != null &&
post.Attributes.GetNamedItem("class").Value == "headerSpace")
{
foreach (XmlNode fragment in post.ChildNodes)
{
if (fragment.Attributes != null && fragment.Attributes.GetNamedItem("class") != null &&
fragment.Attributes.GetNamedItem("class").Value == "newsDato")
dato = umbraco.xmlHelper.GetNodeValue(fragment);
else if (fragment.Attributes != null && fragment.Attributes.GetNamedItem("class") != null &&
fragment.Attributes.GetNamedItem("class").Value == "newsHeader")
header = umbraco.xmlHelper.GetNodeValue(fragment);
else if (dato != "" && header != "" && umbraco.xmlHelper.GetNodeValue(fragment) != "")
{
Document blogPost = Document.MakeNew(
header,
blogPostType,
u,
1056);
blogPost.CreateDateTime = DateTime.Parse(dato);
blogPost.getProperty("bodyText").Value = fragment.InnerXml;
blogPost.Publish(u);
dato = "";
header = "";
}
}
}
}
}
}