From e87c9cb3b816a92a4ab1bceec36420b038e9a615 Mon Sep 17 00:00:00 2001 From: zed Date: Sat, 25 Oct 2014 13:18:41 +0300 Subject: [PATCH] Add option for specifying the text encoding used by Mercurial When a mercurial repository does not use utf-8 for encoding author strings and commit messages the "-e <encoding>" command line option can be used to force fast-export to convert incoming meta data from <encoding> to utf-8. When "-e <encoding>" is given, we use Python's string decoding/encoding API to convert meta data on the fly when processing commits. --- README | 5 +++++ hg-fast-export.py | 20 ++++++++++++++------ hg-fast-export.sh | 4 +++- hg2git.py | 5 ++++- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/README b/README index 99fde49..d2be056 100644 --- a/README +++ b/README @@ -34,6 +34,11 @@ hg-fast-export'ed from mercurial: will give hints on which branches need adjustment for starting over again. +When a mercurial repository does not use utf-8 for encoding author +strings and commit messages the "-e <encoding>" command line option +can be used to force fast-export to convert incoming meta data from +<encoding> to utf-8. + As mercurial appears to be much less picky about the syntax of the author information than git, an author mapping file can be given to hg-fast-export to fix up malformed author strings. 
The file is diff --git a/hg-fast-export.py b/hg-fast-export.py index 4ed42e4..b693364 100755 --- a/hg-fast-export.py +++ b/hg-fast-export.py @@ -159,7 +159,7 @@ def sanitize_name(name,what="branch"): sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n)) return n -def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,notes): +def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,notes,encoding=''): def get_branchname(name): if brmap.has_key(name): return brmap[name] @@ -167,7 +167,7 @@ def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags, brmap[name]=n return n - (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) + (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding) branch=get_branchname(branch) @@ -323,7 +323,7 @@ def verify_heads(ui,repo,cache,force): return True -def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False,hgtags=False,notes=False): +def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False,hgtags=False,notes=False,encoding=''): _max=int(m) old_marks=load_cache(marksfile,lambda s: int(s)-1) @@ -354,7 +354,7 @@ def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=Fals c=0 brmap={} for rev in range(min,max): - c=export_commit(ui,repo,rev,old_marks,max,c,authors,sob,brmap,hgtags,notes) + c=export_commit(ui,repo,rev,old_marks,max,c,authors,sob,brmap,hgtags,notes,encoding) state_cache['tip']=max state_cache['repo']=repourl @@ -401,6 +401,8 @@ if __name__=='__main__': help="use <name> as namespace to track upstream") parser.add_option("--hg-hash",action="store_true",dest="notes", default=False,help="Annotate commits with the hg hash as git notes in the hg namespace") + parser.add_option("-e",dest="encoding", + help="Assume commit and author strings retrieved from Mercurial are 
encoded in <encoding>") (options,args)=parser.parse_args() @@ -423,5 +425,11 @@ if __name__=='__main__': if options.origin_name!=None: set_origin_name(options.origin_name) - sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,options.headsfile, - options.statusfile,authors=a,sob=options.sob,force=options.force,hgtags=options.hgtags,notes=options.notes)) + encoding='' + if options.encoding!=None: + encoding=options.encoding + + sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile, + options.headsfile, options.statusfile,authors=a, + sob=options.sob,force=options.force,hgtags=options.hgtags, + notes=options.notes,encoding=encoding)) diff --git a/hg-fast-export.sh b/hg-fast-export.sh index 3747fc6..05bbec9 100755 --- a/hg-fast-export.sh +++ b/hg-fast-export.sh @@ -13,7 +13,7 @@ SFX_STATE="state" GFI_OPTS="" PYTHON=${PYTHON:-python} -USAGE="[--quiet] [-r <repo>] [--force] [-m <max>] [-s] [--hgtags] [-A <file>] [-M <name>] [-o <name>] [--hg-hash]" +USAGE="[--quiet] [-r <repo>] [--force] [-m <max>] [-s] [--hgtags] [-A <file>] [-M <name>] [-o <name>] [--hg-hash] [-e <encoding>]" LONG_USAGE="Import hg repository <repo> up to either tip or <max> If <repo> is omitted, use last hg repository as obtained from state file, GIT_DIR/$PFX-$SFX_STATE by default. @@ -34,6 +34,8 @@ Options: -o <name> Use <name> as branch namespace to track upstream (eg 'origin') --hg-hash Annotate commits with the hg hash as git notes in the hg namespace. 
+ -e <encoding> Assume commit and author strings retrieved from + Mercurial are encoded in <encoding> " case "$1" in -h|--help) diff --git a/hg2git.py b/hg2git.py index dedfd95..c58cade 100755 --- a/hg2git.py +++ b/hg2git.py @@ -67,9 +67,12 @@ def get_branch(name): return origin_name + '/' + name return name -def get_changeset(ui,repo,revision,authors={}): +def get_changeset(ui,repo,revision,authors={},encoding=''): node=repo.lookup(revision) (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) + if encoding: + user=user.decode(encoding).encode('utf8') + desc=desc.decode(encoding).encode('utf8') tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) branch=get_branch(extra.get('branch','master')) return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) -- 2.11.0