/*
   Copyright (C) 2001 Andreas Schlapbach <schlpbch@iam.unibe.ch>
   Copyright (C) 2003 Antonio Larrosa <larrosa@kde.org>

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.
*/

#include "archivedialog.h"
#include <tqwidget.h>
#include <tdehtml_part.h>
#include "archiveviewbase.h"
#include <kinstance.h>
#include <ktempfile.h>
#include <ktar.h>

#include <tdefiledialog.h>
#include <kmessagebox.h>
#include <kpassivepopup.h>
#include <klocale.h>
#include <tdeio/netaccess.h>
#include <tdehtml_part.h>
#include <kdebug.h>
#include <kgenericfactory.h>
#include <kactivelabel.h>
#include <tqstylesheet.h>
#include <tqiodevice.h>
#include <klistview.h>
#include <tdeio/job.h>
#include <kapplication.h>
#include <kurllabel.h>
#include <kprogress.h>
#include <kstringhandler.h>
#include <tqpushbutton.h>

#undef DEBUG_WAR

#define CONTENT_TYPE "<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">"

/* Tell the user that a temporary file could not be created or read back. */
static void reportTempFileError()
{
  const TQString title = i18n( "Could Not Open Temporary File" );
  const TQString text = i18n( "Could not open a temporary file" );
  KMessageBox::sorry( 0, text, title );
}

/*
 * Escape an attribute value so that it can be written between double
 * quotes: '<', '>' and '&' are handled by TQStyleSheet::escape(), the
 * double quote itself is turned into &quot;.
 */
static TQString escapeAttributeValue(const TQString &value)
{
  TQString escaped = TQStyleSheet::escape(value);
  escaped.replace(TQString("\""), TQString("&quot;"));
  return escaped;
}

/*
 * Build the (modeless, self-deleting) archiver dialog.
 *
 * parent   - parent widget of the dialog
 * filename - path of the archive to create; shown in the dialog and handed
 *            to the KTar object
 * part     - the part whose document is archived; its URL becomes m_url
 */
ArchiveDialog::ArchiveDialog(TQWidget *parent, const TQString &filename,
                             TDEHTMLPart *part) :
  KDialogBase(parent, "WebArchiveDialog", false, i18n("Web Archiver"),
              KDialogBase::Ok | KDialogBase::Cancel | KDialogBase::User1 ),
  m_bPreserveWS(false), m_tmpFile(0), m_url(part->url())
{
  m_widget=new ArchiveViewBase(this);
  setMainWidget(m_widget);
  setWFlags(getWFlags() | WDestructiveClose);

  m_widget->urlLabel->setText(TQString("<a href=\"")+m_url.url()+"\">"+KStringHandler::csqueeze( m_url.url(), 80 )+"</a>");
  m_widget->targetLabel->setText(TQString("<a href=\"")+filename+"\">"+KStringHandler::csqueeze( filename, 80 )+"</a>");

  // Archive the owner document when the part's document has one.
  if(part->document().ownerDocument().isNull())
    m_document = part->document();
  else
    m_document = part->document().ownerDocument();

  // The OK button doubles as "Close" and stays disabled until archiving is done.
  enableButtonOK( false );
  showButton( KDialogBase::User1, false );
  setButtonOK( KStdGuiItem::close() );

  m_tarBall = new KTar(filename,"application/x-gzip");
}

/*
 * Start archiving: open the tarball for writing and kick off saveFile().
 * Shows an error box when the archive cannot be opened.
 */
void ArchiveDialog::archive()
{
  m_iterator=0;
  m_currentLVI=0;
  if (m_tarBall->open(IO_WriteOnly)) {
#ifdef DEBUG_WAR
    kdDebug(90110) << "Web Archive opened " << endl;
#endif

    // Reserve the name of the main document so downloads cannot clash with it.
    m_linkDict.insert(TQString("index.html"), TQString(""));
    saveFile("index.html");

  } else {
    const TQString title = i18n( "Unable to Open Web-Archive" );
    const TQString text = i18n( "Unable to open \n %1 \n for writing." ).arg(m_tarBall->fileName());
    KMessageBox::sorry( 0L, text, title );
  }
}

ArchiveDialog::~ArchiveDialog()
{
  // A download may still be pending when the dialog goes away; do not leak
  // the temporary file object (or the file behind it) in that case.
  if (m_tmpFile) {
    m_tmpFile->unlink();
    delete m_tmpFile;
    m_tmpFile = 0;
  }
  delete m_tarBall;
}

/*
 * First pass over the document: walk the DOM once in the Retrieving state
 * so that handleLink() collects every URL to download, then start the
 * download queue. The text produced by this pass is thrown away.
 */
void ArchiveDialog::saveFile( const TQString&)
{
  // Only used to probe whether temporary files can be created at all;
  // make sure the probe file is removed again when it goes out of scope.
  KTempFile tmpFile;
  tmpFile.setAutoDelete(true);

  if (!(tmpFile.status())) {

    TQString temp;

    m_state=Retrieving;
    TQTextStream tempStream(&temp, IO_ReadOnly);

    saveToArchive(&tempStream);

    m_downloadedURLDict.clear();

    m_state=Downloading;
    m_widget->progressBar->setTotalSteps(m_urlsToDownload.count());
    m_widget->progressBar->setProgress(0);
    downloadNext();

  } else {
    reportTempFileError();
  }
}

/*
 * Second pass over the document, run once all downloads are finished:
 * serialize the DOM in the Saving state (links now resolve to archive
 * entries), store the result as index.html, close the tarball and switch
 * the dialog buttons to the "finished" configuration.
 */
void ArchiveDialog::setSavingState()
{
  KTempFile tmpFile;
  tmpFile.setAutoDelete(true);  // removed on every exit path
  TQTextStream* textStream = tmpFile.textStream();
  if (tmpFile.status() != 0 || !textStream) {
    // Leave the dialog as it is; the user can still cancel it.
    reportTempFileError();
    return;
  }
  textStream->setEncoding(TQTextStream::UnicodeUTF8);

  m_widget->progressBar->setProgress(m_widget->progressBar->totalSteps());

  m_state=Saving;
  saveToArchive(textStream);

  tmpFile.close();

  TQString fileName="index.html";
  TQFile file(tmpFile.name());
  if (!file.open(IO_ReadOnly)) {
    reportTempFileError();
    return;
  }
  // Take size and data from the same buffer so they cannot disagree.
  const TQByteArray contents = file.readAll();
  file.close();
  m_tarBall->writeFile(fileName, TQString(), TQString(), contents.size(), contents.data());
#ifdef DEBUG_WAR
  kdDebug(90110) << "HTML-file written: " << fileName << endl;
#endif

  m_tarBall->close();

  KPassivePopup::message( m_url.prettyURL() , i18n( "Archiving webpage completed." ), this );

  enableButtonOK(true);
  setEscapeButton(Ok);
  actionButton(Ok)->setFocus();
  enableButtonCancel(false);
}

/*
 * Write the "saved from" header comment and then recursively traverse the
 * DOM tree, writing it to _textStream. Does nothing for a null stream.
 */
void ArchiveDialog::saveToArchive(TQTextStream* _textStream)
{
  if (!_textStream) return;

  // Record where the page came from
  (*_textStream) <<"<!-- saved from:" << endl << m_url.url() << " -->" << endl;

  try
  {
    saveArchiveRecursive(m_document.documentElement(), m_url, _textStream, 0);
  }
  catch (...)
  {
    kdDebug(90110) << "exception" << endl;
  }
}

/*
 * Return true when pNode carries an attribute whose upper-cased name equals
 * attrName and whose upper-cased value equals attrValue. Both arguments are
 * therefore expected in upper case.
 */
static bool hasAttribute(const DOM::Node &pNode, const TQString &attrName, const TQString &attrValue)
{
  const DOM::Element element = static_cast<DOM::Element>(pNode);
  DOM::NamedNodeMap attrs = element.attributes();
  const unsigned long attrCount = attrs.length();
  for( unsigned int j=0; j<attrCount; j++ ) {
    const DOM::Attr attr = static_cast<DOM::Attr>(attrs.item(j));
    if ((attr.name().string().upper() == attrName) &&
        (attr.value().string().upper() == attrValue))
      return true;
  }
  return false;
}

/*
 * Return true when pNode has a direct child whose upper-cased node name
 * equals nodeName (which is expected in upper case).
 */
static bool hasChildNode(const DOM::Node &pNode, const TQString &nodeName)
{
  DOM::Node child;
  try
  {
    // We might throw a DOM exception
    child = pNode.firstChild();
  }
  catch (...)
  {
    // No children, stop recursion here
    child = DOM::Node();
  }

  while(!child.isNull()) {
    if (child.nodeName().string().upper() == nodeName)
      return true;
    child = child.nextSibling();
  }
  return false;
}

/*
 * Transform the DOM tree rooted at pNode to HTML.
 *
 * pNode       - node to serialize (element or text node)
 * baseURL     - URL against which relative links are resolved
 * _textStream - destination of the generated markup
 * indent      - number of spaces to indent this node with; children are
 *               written with indent+2
 *
 * Links to stylesheets, frames, images, scripts and backgrounds are passed
 * through handleLink(), so the effect depends on the current m_state.
 */
void ArchiveDialog::saveArchiveRecursive(const DOM::Node &pNode, const KURL& baseURL,
                                         TQTextStream* _textStream, int indent)
{
  const TQString nodeNameOrig(pNode.nodeName().string());
  const TQString nodeName(pNode.nodeName().string().upper());
  TQString text;
  TQString strIndent;
  strIndent.fill(' ', indent);
  const DOM::Element element = static_cast<DOM::Element>(pNode);
  DOM::Node child;

  if ( !element.isNull() ) {
    if (nodeName.at(0)=='-') {
      /* Don't save tdehtml internal tags '-konq..'
       * Approximating it with <DIV>
       */
      text += "<DIV> <!-- -KONTQ_BLOCK -->";
    } else if (nodeName == "BASE") {
      /* Skip BASE, everything is relative to index.html
       * Saving SCRIPT but they can cause trouble!
       */
    } else if ((nodeName == "META") && hasAttribute(pNode, "HTTP-EQUIV", "CONTENT-TYPE")) {
      /* Skip content-type meta tag, we provide our own.
       */
    } else {
      if (!m_bPreserveWS) {
        if (nodeName == "PRE") {
          m_bPreserveWS = true;
        }
        text = strIndent;
      }
      text += "<" + nodeNameOrig;
      TQString attributes;
      TQString attrNameOrig, attrName, attrValue;
      DOM::Attr attr;
      DOM::NamedNodeMap attrs = element.attributes();
      unsigned long lmap = attrs.length();
      for( unsigned int j=0; j<lmap; j++ ) {
        attr = static_cast<DOM::Attr>(attrs.item(j));
        attrNameOrig = attr.name().string();
        attrName = attrNameOrig.upper();
        attrValue = attr.value().string();

#if 0
        if ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") {
          //attrValue = handleLink(baseURL, attrValue);

          /* Going recursively down creating a DOM-Tree for the Frame, second Level of recursion */
          //## Add Termination criteria, on the other hand frames are not indefinitely nested, are they :)

          TDEHTMLPart* part = new TDEHTMLPart();
          KURL absoluteURL = getAbsoluteURL(baseURL, attrValue);
          part->openURL(absoluteURL);
          saveFile(getUniqueFileName(absoluteURL.fileName()), part);
          delete part;

        } else if
#endif
        if ((nodeName == "LINK" && attrName == "HREF") || // Download stylesheets, js-script, ..
            ((nodeName == "FRAME" || nodeName == "IFRAME") && attrName == "SRC") ||
            ((nodeName == "IMG" || nodeName == "INPUT" || nodeName == "SCRIPT") && attrName == "SRC") ||
            ((nodeName == "BODY" || nodeName == "TABLE" || nodeName == "TH" || nodeName == "TD") && attrName == "BACKGROUND")) {
          // Some people use carriage return in file names and browsers support that!
          attrValue = handleLink(baseURL, attrValue.replace(TQRegExp("\\s"), ""));
        }
        /*
         * ## Make recursion level configurable
         */
        /*
        } else if (nodeName == "A" && attrName == "HREF") {
           attrValue = handleLink(baseURL, attrValue);
        */

        // The DOM hands out decoded values; re-escape them so that quotes
        // and markup characters cannot break the generated tag.
        attributes += " " + attrName + "=\"" + escapeAttributeValue(attrValue) + "\"";
      }
      if (!(attributes.isEmpty())){
        text += " ";
      }
      text += attributes.simplifyWhiteSpace();
      text += ">";

      if (nodeName == "HTML") {
        /* Search for a HEAD tag, if not found, generate one.
         */
        if (!hasChildNode(pNode, "HEAD"))
          text += "\n" + strIndent + " <HEAD>" CONTENT_TYPE "</HEAD>";
      }
      else if (nodeName == "HEAD") {
        text += "\n" + strIndent + " " + CONTENT_TYPE;
      }
    }
  } else {
    const TQString nodeValue(pNode.nodeValue().string());
    if (!(nodeValue.isEmpty())) {
      // Don't escape < > in JS or CSS
      TQString parentNodeName = pNode.parentNode().nodeName().string().upper();
      if (parentNodeName == "STYLE") {
        text = analyzeInternalCSS(baseURL, nodeValue);
      } else if (m_bPreserveWS) {
        text = TQStyleSheet::escape(nodeValue);
      } else if (parentNodeName == "SCRIPT") {
        text = nodeValue;
      } else {
        text = strIndent + TQStyleSheet::escape(nodeValue);
      }
    }
  }

#ifdef DEBUG_WAR
  kdDebug(90110) << "text:" << text << endl;
#endif
  if (!(text.isEmpty())) {
    (*_textStream) << text;
    if (!m_bPreserveWS) {
      (*_textStream) << endl;
    }
  }

  try
  {
    // We might throw a DOM exception
    child = pNode.firstChild();
  }
  catch (...)
  {
    // No children, stop recursion here
    child = DOM::Node();
  }

  while(!child.isNull()) {
    saveArchiveRecursive(child, baseURL, _textStream, indent+2);
    child = child.nextSibling();
  }

  if (!(element.isNull())) {
    if (nodeName == "AREA" || nodeName == "BASE" || nodeName == "BASEFONT" ||
        nodeName == "BR" || nodeName == "COL" || nodeName == "FRAME" ||
        nodeName == "HR" || nodeName == "IMG" || nodeName == "INPUT" ||
        nodeName == "ISINDEX" || nodeName == "META" || nodeName == "PARAM") {

      /* Closing Tag is forbidden, see HTML 4.01 Specs: Index of Elements */

    } else {
      if (!m_bPreserveWS) {
        text = strIndent;
      } else {
        text ="";
      }
      if (nodeName.at(0)=='-') {
        text += "</DIV> <!-- -KONTQ_BLOCK -->";
      } else {
        text += "</" + pNode.nodeName().string() + ">";
        if (nodeName == "PRE") {
          m_bPreserveWS = false;
        }
      }
#ifdef DEBUG_WAR
      kdDebug(90110) << text << endl;
#endif
      if (!(text.isEmpty())) {
        (*_textStream) << text;
        if (!m_bPreserveWS) {
          (*_textStream) << endl;
        }
      }
    }
  }
}

/*
 * Resolve _link against _url and process it according to the current state:
 * while Retrieving the URL is queued for download (and an empty string is
 * returned), while Saving the name of the matching archive entry is
 * returned. URLs that fail the "redirect" authorization check are ignored
 * and yield an empty string.
 */
TQString ArchiveDialog::handleLink(const KURL& _url, const TQString& _link)
{
  KURL url(getAbsoluteURL(_url, _link));

  TQString tarFileName;
  if (kapp->authorizeURLAction("redirect", _url, url))
  {
    if (m_state==Retrieving)
      m_urlsToDownload.append(url);
    else if (m_state==Saving)
      tarFileName = m_downloadedURLDict[url.url()];
  }

  return tarFileName;
}

/*
 * Start the download of the next queued URL that has not been fetched yet.
 * When the queue is exhausted, switch to the saving state instead.
 * finishedDownloadingURL() is invoked when the started job reports a result.
 */
void ArchiveDialog::downloadNext()
{
  // Only download each file once: skip queue entries that were already
  // fetched (iteratively, so long runs of duplicates cannot deepen the stack).
  while (m_iterator < m_urlsToDownload.count() &&
         m_downloadedURLDict.contains(m_urlsToDownload[m_iterator].url()))
  {
#ifdef DEBUG_WAR
    kdDebug(90110) << "File already downloaded: " << m_urlsToDownload[m_iterator].url()
                   << m_downloadedURLDict.count() << endl;
#endif
    m_iterator++;
  }

  if (m_iterator>=m_urlsToDownload.count())
  {
    // We've already downloaded all the files we wanted, let's save them
    setSavingState();
    return;
  }

  KURL url=m_urlsToDownload[m_iterator];

#ifdef DEBUG_WAR
  kdDebug(90110) << "URL : " << url.url() << endl;
#endif

  // Reserve a temporary file name; the file itself is removed again so that
  // the copy job can create it.
  delete m_tmpFile;
  m_tmpFile=new KTempFile();
  m_tmpFile->close();
  TQFile::remove(m_tmpFile->name());
  kdDebug(90110) << "downloading: " << url.url() << " to: " << m_tmpFile->name() << endl;
  KURL dsturl;
  dsturl.setPath(m_tmpFile->name());
  TDEIO::Job *job=TDEIO::file_copy(url, dsturl, -1, false, false, false);
  job->addMetaData("cache", "cache"); // Use entry from cache if available.
  connect(job, TQT_SIGNAL(result( TDEIO::Job *)), this, TQT_SLOT(finishedDownloadingURL( TDEIO::Job *)) );

  m_currentLVI=new TQListViewItem(m_widget->listView, url.prettyURL());
  m_widget->listView->insertItem( m_currentLVI );
  m_currentLVI->setText(1,i18n("Downloading"));
}

/*
 * Slot called when the copy job started by downloadNext() has finished.
 * Updates the list entry and progress bar, stores the downloaded data in
 * the tarball under a unique name, records the URL -> entry mapping and
 * continues with the next URL.
 */
void ArchiveDialog::finishedDownloadingURL( TDEIO::Job *job )
{
  if ( job->error() )
  {
    // TQString s=job->errorString();
    m_currentLVI->setText(1,i18n("Error"));
  }
  else
    m_currentLVI->setText(1,i18n("Ok"));

  m_widget->progressBar->advance(1);

  KURL url=m_urlsToDownload[m_iterator];

  TQString tarFileName = getUniqueFileName(url.fileName());

  // Add file to Tar-Ball
  // NOTE(review): an entry is written even when the job failed or the file
  // cannot be opened (presumably an empty one) - confirm this is intended.
  TQByteArray contents;
  TQFile file(m_tmpFile->name());
  if (file.open(IO_ReadOnly)) {
    contents = file.readAll();
    file.close();
  }
  // Size and data come from the same buffer so they cannot disagree.
  m_tarBall->writeFile(tarFileName, TQString(), TQString(), contents.size(), contents.data());
  m_tmpFile->unlink();
  delete m_tmpFile;
  m_tmpFile=0;

  // Add URL to downloaded URLs
  m_downloadedURLDict.insert(url.url(), tarFileName);
  m_linkDict.insert(tarFileName, TQString(""));

  m_iterator++;
  downloadNext();
}

/*
 * Create an absolute URL for download by resolving _link relative to _url.
 */
KURL ArchiveDialog::getAbsoluteURL(const KURL& _url, const TQString& _link)
{
  // Does all the magic for me
  return KURL(_url, _link);
}

/*
 * Return a name based on fileName that is neither empty nor already used
 * inside the tarball; on a clash a running number is prepended.
 */
TQString ArchiveDialog::getUniqueFileName(const TQString& fileName)
{
  // Name clash -> add unique id
  static int id=2;
  TQString uniqueFileName(fileName);

#ifdef DEBUG_WAR
  kdDebug(90110) << "getUniqueFileName(..): [" << fileName << "]" << endl;
#endif

  while (uniqueFileName.isEmpty() || m_linkDict.contains(uniqueFileName))
    uniqueFileName = TQString::number(id++) + fileName;

  return uniqueFileName;
}

/*
 * Search for url(...) references in a piece of CSS, pass each referenced
 * URL through handleLink() and return the CSS with the references replaced
 * by the results.
 *
 * _url   - base URL used to resolve the references
 * string - the CSS text; it is not modified
 */
TQString ArchiveDialog::analyzeInternalCSS(const KURL& _url, const TQString& string)
{
#ifdef DEBUG_WAR
  kdDebug () << "analyzeInternalCSS" << endl;
#endif

  TQString str(string);
  int pos = 0;

  while ((pos = str.find("url(", pos)) != -1) {
    int startUrl = pos + 4; // url(

    if (startUrl < (int)str.length() &&
        (str[startUrl]=='"' || str[startUrl]=='\'')) // CSS 'feature'
      startUrl++;

    const int closeParen = str.find(")", startUrl);
    if (closeParen == -1)
      break; // unterminated url( - nothing sensible left to rewrite

    int endUrl = closeParen;
    if (endUrl > startUrl &&
        (str[endUrl-1]=='"' || str[endUrl-1]=='\'')) // CSS 'feature'
      endUrl--;

    TQString url = str.mid(startUrl, endUrl-startUrl);

#ifdef DEBUG_WAR
    kdDebug () << "url: " << url << endl;
#endif

    url = handleLink(_url, url);

#ifdef DEBUG_WAR
    kdDebug () << "url: " << url << endl;
#endif

    str.replace(startUrl, endUrl-startUrl, url);

    // Continue right behind the replacement; its length may differ from
    // the length of the original reference.
    pos = startUrl + (int)url.length();
  }
  return str;
}

#include "archivedialog.moc"