#!/usr/local/bin/perl
use
Cwd
qw(cwd abs_path)
;
my
(
$Help
,
$Force
,
$In
,
$Out
,
$CutChars
,
$XsdPath
,
$ImagesPath
,
$ImageFileName
,
$WorkType
);
my
$r
= GetOptions(
'help|h'
=> \
$Help
,
'force|f'
=> \
$Force
,
'in|i=s'
=> \
$In
,
'out|o=s'
=> \
$Out
,
'chars|c=i'
=> \
$CutChars
,
'xsd|x=s'
=> \
$XsdPath
,
'imagespath|p=s'
=> \
$ImagesPath
,
'type|t=s'
=> \
$WorkType
,
);
help()
if
$Help
;
unless
(
$In
and
$Out
) {
print
"\nrequired params 'in'/'out' not defined\n"
;
help();
}
if
(
$ImagesPath
) {
die
"Path $ImagesPath not found\n"
unless
-d
$ImagesPath
;
$ImagesPath
=
$ImagesPath
.
'/'
if
$ImagesPath
!~ /\/$/;
}
my
$InTrial
;
if
( not
$WorkType
) {
$WorkType
=
'trial'
;
}
elsif
(
$WorkType
!~ /^(trial|output)$/ ) {
print
"\nparams 'type' must be trial|output\n"
;
help();
}
DEFAULT_TRIAL_PERCENT
=> 24,
};
my
$Blocks
=
qr/^(p|subtitle|ol|ul|pre|table|poem|blockquote|div|subscription)$/
;
RELATION_TYPE_FB3_BOOK
=>
RELATION_TYPE_FB3_BODY
=>
RELATION_TYPE_OPC_THUMBNAIL
=>
RELATION_TYPE_CORE_PROP
=>
RELATION_TYPE_FB3_IMAGES
=>
};
our
$Validator
= FB3::Validator->new(
$XsdPath
);
my
$Finish
;
my
$CharsProcessed
= 0;
my
%ImageHash
;
my
%HrefHash
;
my
%IdHash
;
my
%NoteHash
;
my
%NoteIdHash
;
my
$TmpDir
= File::Temp::tempdir(
CLEANUP
=> 1 );
my
$FB3TmpDir
=
$TmpDir
.
'/unzipped_'
.File::Basename::basename(
$In
);
my
$UnzipMessage
= `/usr/bin/unzip -o -d
$FB3TmpDir
$In
2>&1`;
die
"Unzipping fb3 error: $UnzipMessage"
if
$?;
my
$FB3Package
= FB3->new(
from_dir
=>
$FB3TmpDir
);
my
$FB3Body
=
$FB3Package
->Body;
my
$BodyXML
=
$FB3Body
->Content;
my
$Parser
= XML::LibXML->new();
my
$BodyDoc
=
$Parser
->load_xml(
string
=>
$BodyXML
,
huge
=> 1 );
my
$RootNode
=
$BodyDoc
->getDocumentElement();
my
$XPC
= XML::LibXML::XPathContext->new(
$RootNode
);
$XPC
->registerNs(
'fb'
,
&NS_FB3_BODY
);
$XPC
->registerNs(
'fbd'
,
&NS_FB3_DESCRIPTION
);
my
$CharsFull
;
if
(
$WorkType
eq
'trial'
) {
$CharsFull
=
length
(
$RootNode
->textContent);
foreach
my
$TrialOnlyNode
(
$XPC
->findnodes(
'/fb:fb3-body/fb:section[@output="trial-only"]'
,
$RootNode
)) {
$CharsFull
-= (
length
(
$TrialOnlyNode
->textContent) || 0);
}
$CutChars
||= DEFAULT_TRIAL_PERCENT * 0.01 *
$CharsFull
;
ProceedNodeTrial(
$RootNode
);
}
elsif
(
$WorkType
eq
'output'
) {
ProceedNodeOut(
$RootNode
);
}
CollectElementStat(
$RootNode
);
CleanLinks(
$FB3Body
);
CleanImages(
$FB3Package
);
CleanNotes(
$FB3Package
);
$BodyDoc
->toFile(
$FB3Body
->PhysicalName, 0);
if
(
$WorkType
eq
'trial'
) {
my
$CharsTrial
=
length
(
$RootNode
->textContent);
my
$FB3Descr
=
$FB3Package
->Meta;
my
$DescrXML
=
$FB3Descr
->Content;
my
$DescrDoc
=
$Parser
->load_xml(
string
=>
$DescrXML
,
huge
=> 1 );
my
$DescrNode
=
$DescrDoc
->getDocumentElement();
if
(
my
$FragNode
=
$XPC
->findnodes(
"/fbd:fb3-description/fbd:fb3-fragment"
,
$DescrDoc
)->[0] ) {
$FragNode
->unbindNode();
}
my
$FB3FragmentNode
=
$DescrDoc
->createElement(
'fb3-fragment'
);
$FB3FragmentNode
->setAttribute(
'full_length'
,
$CharsFull
);
$FB3FragmentNode
->setAttribute(
'fragment_length'
,
$CharsTrial
);
$DescrNode
->appendChild(
$FB3FragmentNode
);
$DescrDoc
->toFile(
$FB3Descr
->PhysicalName, 0);
}
die
"Empty body in result file"
unless
scalar
@{
$XPC
->findnodes(
"/fb:fb3-body/fb:section"
,
$RootNode
)};
ZipFolder(
"$FB3TmpDir/"
,
$Out
);
if
(
my
$ValidationError
=
$Validator
->Validate(
$Out
) ) {
unless
(
$Force
) {
unlink
$Out
;
die
"Validation error: $ValidationError"
;
}
}
sub
CollectElementStat {
my
$Node
=
shift
||
return
;
for
my
$ChildNode
(
$Node
->childNodes) {
next
if
$ChildNode
->nodeName eq
'#text'
;
CollectElementStat(
$ChildNode
);
$ImageHash
{
$ChildNode
->getAttribute(
'src'
)} = 1
if
$ChildNode
->nodeName eq
'img'
;
$NoteHash
{
$ChildNode
->getAttribute(
'href'
)} = 1
if
$ChildNode
->nodeName eq
'note'
;
if
(
my
$Id
=
$ChildNode
->getAttribute(
'id'
)) {
if
(
$ChildNode
->nodeName eq
'notebody'
) {
$NoteIdHash
{
$Id
} = 1;
}
else
{
$IdHash
{
$Id
} = 1;
}
}
if
(
$ChildNode
->nodeName eq
'a'
) {
my
$Href
=
$ChildNode
->getAttribute(
'xlink:href'
);
$HrefHash
{
$Href
} = 1
if
(
$Href
and
$Href
=~ s/^
}
}
}
sub
ProceedNodeOut {
my
$Node
=
shift
||
return
;
for
my
$ChildNode
(
$Node
->childNodes) {
next
unless
$ChildNode
->nodeName eq
'section'
;
my
$Output
=
$ChildNode
->getAttribute(
'output'
);
$ChildNode
->unbindNode()
if
(
$Output
and
$Output
eq
'trial-only'
);
}
}
sub
ProceedNodeTrial {
my
$Node
=
shift
||
return
;
my
$ImmortalBranch
=
shift
|| 0;
my
$NodeName
=
$Node
->nodeName;
if
(
$NodeName
eq
'#text'
) {
$CharsProcessed
+=
length
(
$Node
->nodeValue);
$Node
->unbindNode()
if
(
$Finish
&& !
$ImmortalBranch
);
return
;
}
my
$OuterImmortal
;
if
(
$NodeName
eq
'section'
) {
if
(
$Node
->parentNode->nodeName eq
'fb3-body'
&&
defined
$Node
->getAttribute(
'output'
)
) {
my
$SectionOutput
=
$Node
->getAttribute(
'output'
);
if
(
$SectionOutput
eq
'payed'
) {
$Node
->unbindNode();
return
$Node
;
}
if
(
$SectionOutput
eq
'trial'
||
$SectionOutput
eq
'trial-only'
) {
$OuterImmortal
= 1;
$ImmortalBranch
= 1;
$InTrial
= 1
if
$WorkType
eq
'trial'
;
}
elsif
(
$InTrial
) {
$InTrial
= 0;
$Finish
= 1;
}
}
$Node
->setAttribute(
'first-char-pos'
,
$CharsProcessed
+1);
}
elsif
(
$NodeName
eq
'title'
&& (
$Node
->parentNode->nodeName eq
'section'
||
$Node
->parentNode->nodeName eq
'notes'
)) {
$ImmortalBranch
= 1;
}
elsif
(
$NodeName
eq
'img'
&& (!
$Finish
||
$ImmortalBranch
) ) {
my
$ImgSrc
=
$Node
->getAttribute(
'src'
);
$ImageHash
{
$ImgSrc
} = 1
if
$ImgSrc
;
}
elsif
(
$NodeName
eq
'note'
&& (!
$Finish
||
$ImmortalBranch
) ) {
my
$NoteHref
=
$Node
->getAttribute(
'href'
);
$NoteHash
{
$NoteHref
} = 1
if
$NoteHref
;
}
elsif
(
$NodeName
eq
'notebody'
) {
if
(!
$NoteHash
{
$Node
->getAttribute(
'id'
)}) {
$Node
->unbindNode();
}
else
{
$ImmortalBranch
= 1;
}
}
for
my
$ChildNode
(
$Node
->childNodes) {
if
(
$ChildNode
->nodeName =~ /
$Blocks
/) {
if
( (
$CharsProcessed
+
length
(
$ChildNode
->textContent)) >=
$CutChars
) {
$Finish
= 1;
}
}
ProceedNodeTrial(
$ChildNode
,
$ImmortalBranch
);
}
if
(
$NodeName
eq
'section'
) {
my
@SectionChildren
=
$Node
->nonBlankChildNodes;
$Node
->setAttribute(
'clipped'
,
'true'
)
if
$Finish
&& !
$OuterImmortal
;
if
(
scalar
@SectionChildren
== 0 || (
scalar
@SectionChildren
== 1 &&
$SectionChildren
[0]->nodeName eq
'title'
)
|| (
scalar
@SectionChildren
== 2 &&
$SectionChildren
[0]->nodeName eq
'title'
&& (
$SectionChildren
[1]->nodeName eq
'epigraph'
||
$SectionChildren
[1]->nodeName eq
'annotation'
))
) {
$SectionChildren
[1]->unbindNode()
if
(
$SectionChildren
[1] && (
$SectionChildren
[1]->nodeName eq
'epigraph'
||
$SectionChildren
[1]->nodeName eq
'annotation'
));
my
$ClippedNode
= XML::LibXML::Element->new(
'clipped'
);
$Node
->appendChild(
$ClippedNode
);
}
}
if
(
$NodeName
eq
'image'
) {
$ImageFileName
=
$Node
->getAttribute(
'href'
);
$ImageFileName
=~ s/^
if
( -f
$ImagesPath
.
$ImageFileName
) {
my
(
$x
,
$y
) = imgsize(
$ImagesPath
.
$ImageFileName
);
$CharsProcessed
+= 200
if
(
$x
&&
$y
&&
$x
>= 100 &&
$y
>= 100);
}
}
if
(
$Finish
&& !
$ImmortalBranch
&& (!
$Node
->firstChild || Trim(InNode(
$Node
)) eq
''
) ) {
$Node
->unbindNode();
}
return
$Node
;
}
sub
InNode {
my
$Node
=
shift
;
join
""
,
map
{
$_
->toString}
$Node
->childNodes;
}
sub
Trim {
my
$Str
=
shift
;
$Str
=~ s/[\n\r]//g;
$Str
=~ s/^\s+//g;
$Str
=~ s/\s+$//g;
return
$Str
;
}
sub
NReplace {
my
$Node
=
shift
;
my
$Parent
=
$Node
->parentNode();
my
$Clone
=
$BodyDoc
->createElement(
$Parent
->nodeName());
foreach
(
$Parent
->getAttributes) {
$Clone
->setAttribute(
$_
->
nodeName
=>
$_
->value);
}
foreach
my
$PNode
(
$Parent
->childNodes()) {
if
(
$PNode
==
$Node
) {
foreach
(
$PNode
->childNodes) {
$Clone
->appendChild(
$_
);
}
next
;
}
$Clone
->appendChild(
$PNode
);
}
$Parent
->replaceNode(
$Clone
);
}
sub
CleanLinks {
my
$FB3Body
=
shift
;
foreach
my
$Id
(
keys
%NoteIdHash
) {
next
if
exists
$NoteHash
{
$Id
};
my
$Ids
=
$XPC
->findnodes(
'/fb:fb3-body/fb:notes/fb:notebody[@id="'
.
$Id
.
'"]'
);
foreach
my
$NodeIds
(
@$Ids
) {
$NodeIds
->unbindNode();
}
}
foreach
my
$Id
(
keys
%IdHash
) {
next
if
exists
$HrefHash
{
$Id
};
my
$Ids
=
$XPC
->findnodes(
'//*[@id="'
.
$Id
.
'"]'
);
foreach
my
$NodeIds
(
@$Ids
) {
next
if
$NodeIds
->nodeName() eq
'section'
;
next
if
$NodeIds
->nodeName() eq
'notebody'
;
if
(
$NodeIds
->nodeName eq
'span'
) {
NReplace(
$NodeIds
);
}
else
{
$NodeIds
->removeAttribute(
'id'
);
}
}
delete
$IdHash
{
$Id
};
}
foreach
my
$Href
(
keys
%HrefHash
) {
next
if
exists
$IdHash
{
$Href
};
my
$Hrefs
=
$XPC
->findnodes(
'//fb:a[@xlink:href="#'
.
$Href
.
'"]'
);
foreach
my
$NodeHrefs
(
@$Hrefs
) {
if
(
$NodeHrefs
->getAttribute(
'id'
)) {
$NodeHrefs
->removeAttribute(
'xlink:href'
);
$NodeHrefs
->setNodeName(
'span'
);
next
;
}
NReplace(
$NodeHrefs
);
}
delete
$HrefHash
{
$Href
};
}
}
sub
CleanNotes {
my
$Node
=
$XPC
->findnodes(
"/fb:fb3-body/fb:notes"
,
$RootNode
)->[0];
return
unless
$Node
;
my
@NotesChildren
=
$Node
->nonBlankChildNodes;
if
(
scalar
@NotesChildren
== 0
|| (
scalar
@NotesChildren
== 1 &&
$NotesChildren
[0]->nodeName eq
'title'
)
|| (
scalar
@NotesChildren
== 2 &&
$NotesChildren
[0]->nodeName eq
'title'
&&
$NotesChildren
[1]->nodeName eq
'epigraph'
)
) {
$Node
->unbindNode();
}
}
sub
CleanImages {
my
$FB3
=
shift
;
my
$FB3Body
=
$FB3
->Body;
my
%TrialImageFiles
;
my
@FilePathes
;
my
$cover
=
eval
{
$FB3
->Cover->PhysicalName };
$TrialImageFiles
{
$cover
} = 1
if
$cover
;
my
@ImagesAll
=
$FB3Body
->Relations(
type
=> RELATION_TYPE_FB3_IMAGES);
foreach
(
@ImagesAll
) {
my
$NewPart
=
$FB3
->Part(
$_
->{TargetFullName} );
my
$FullPath
=
$NewPart
->PhysicalName;
push
(
@FilePathes
,
$FullPath
);
if
(
$ImageHash
{
$_
->{Id}}) {
$TrialImageFiles
{
$FullPath
} = 1;
}
else
{
$FB3Body
->RemoveRelations(
id
=>
$_
->{Id} );
}
}
foreach
(
@FilePathes
) {
unlink
$_
unless
$TrialImageFiles
{
$_
};
}
return
1;
}
sub
ZipFolder {
my
(
$SourceFileName
,
$TargetFileName
,
@first
) =
@_
;
open
my
$FH
,
'>'
,
"$TargetFileName"
;
close
$FH
;
my
$fn_abs
= abs_path(
"$TargetFileName"
);
unlink
"$TargetFileName"
;
my
$old_dir
= cwd();
chdir
"$SourceFileName"
;
my
$cmd
=
"zip -Xr9Dq $fn_abs "
.
join
(
' '
,
@first
) .
" *"
;
my
$CmdResult
= `
$cmd
`;
warn
$CmdResult
if
$CmdResult
;
chdir
$old_dir
if
$old_dir
;
}
sub
help {
print
qq{
USAGE: cutfb3.pl --in <input.file> --out <output.file> [options]
--chars|c : maximum caracters in trial fragment. default is @{[DEFAULT_TRIAL_PERCENT]}
% of text.
--imagespath : /path/to/fb3/images
--type|t : type of work `trial
' or `output'
.
default
is `trial'
--help|h :
print
this text
};
exit
;
}
1;