From: Rocco Rutte Date: Wed, 14 Mar 2007 10:29:24 +0000 (+0000) Subject: Use MIT license, adjust hg2git script names to match fast-export repo style X-Git-Url: http://crossforests.com/gitweb?a=commitdiff_plain;h=c84790da823533320be3cb60e03680564ad37ec5;p=python%2Ffast-export.git Use MIT license, adjust hg2git script names to match fast-export repo style Signed-off-by: Rocco Rutte --- diff --git a/hg-fast-export.py b/hg-fast-export.py new file mode 100644 index 0000000..b876ce4 --- /dev/null +++ b/hg-fast-export.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python + +# Copyright (c) 2007 Rocco Rutte +# License: MIT + +"""hg-fast-export.py - A mercurial-to-git filter for git-fast-import(1) +Usage: hg-fast-export.py +""" + +from mercurial import repo,hg,cmdutil,util,ui,revlog,node +from tempfile import mkstemp +from optparse import OptionParser +import re +import sys +import os + +# silly regex to catch Signed-off-by lines in log message +sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$') +# silly regex to see if user field has email address +user_re=re.compile('([^<]+) (<[^>]+>)$') +# silly regex to clean out user names +user_clean_re=re.compile('^["]([^"]+)["]$') +# git branch for hg's default 'HEAD' branch +cfg_master='master' +# insert 'checkpoint' command after this many commits or none at all if 0 +cfg_checkpoint_count=0 +# write some progress message every this many file contents written +cfg_export_boundary=1000 + +def usage(ret): + sys.stderr.write(__doc__) + return ret + +def setup_repo(url): + myui=ui.ui() + return myui,hg.repository(myui,url) + +def fixup_user(user,authors): + if authors!=None: + # if we have an authors table, try to get mapping + # by defaulting to the current value of 'user' + user=authors.get(user,user) + name,mail,m='','',user_re.match(user) + if m==None: + # if we don't have 'Name ' syntax, use 'user + # ' if use contains no at and + # 'user ' otherwise + name=user + if '@' not in user: + mail='' + else: + mail='<%s>' % user + else: + # if we have 'Name ' syntax, everything is fine :) + name,mail=m.group(1),m.group(2) + + # remove any silly quoting from username + m2=user_clean_re.match(name) + if m2!=None: + name=m2.group(1) + return '%s %s' % (name,mail) + +def get_branch(name): + if name=='HEAD': + name=cfg_master + return name + +def get_changeset(ui,repo,revision,authors={}): + node=repo.lookup(revision) + (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) + tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) + branch=get_branch(extra.get('branch','master')) + return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) + +def gitmode(x): + return x and '100755' or '100644' + +def wr(msg=''): + print msg + #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) + +def checkpoint(count): + count=count+1 + if cfg_checkpoint_count>0 and count%cfg_checkpoint_count==0: + sys.stderr.write("Checkpoint after %d commits\n" % count) + wr('checkpoint') + wr() + return count + +def get_parent_mark(parent,marks): + """Get the mark for some parent. + If we saw it in the current session, return :%d syntax and + otherwise the SHA1 from the cache.""" + return marks.get(str(parent+1),':%d' % (parent+1)) + +def mismatch(f1,f2): + """See if two revisions of a file are not equal.""" + return node.hex(f1)!=node.hex(f2) + +def outer_set(dleft,dright,l,c,r): + """Loop over our repository and find all changed and missing files.""" + for left in dleft.keys(): + right=dright.get(left,None) + if right==None: + # we have the file but our parent hasn't: add to left set + l.append(left) + elif mismatch(dleft[left],right): + # we have it but checksums mismatch: add to center set + c.append(left) + for right in dright.keys(): + left=dleft.get(right,None) + if left==None: + # if parent has file but we don't: add to right set + r.append(right) + # change is already handled when comparing child against parent + return l,c,r + +def get_filechanges(repo,revision,parents,mleft): + """Given some repository and revision, find all changed/deleted files.""" + l,c,r=[],[],[] + for p in parents: + if p<0: continue + mright=repo.changectx(p).manifest() + dleft=mleft.keys() + dleft.sort() + dright=mright.keys() + dright.sort() + l,c,r=outer_set(mleft,mright,l,c,r) + return l,c,r + +def get_author(logmessage,committer,authors): + """As git distincts between author and committer of a patch, try to + extract author by detecting Signed-off-by lines. + + This walks from the end of the log message towards the top skipping + empty lines. Upon the first non-empty line, it walks all Signed-off-by + lines upwards to find the first one. For that (if found), it extracts + authorship information the usual way (authors table, cleaning, etc.) + + If no Signed-off-by line is found, this defaults to the committer. + + This may sound stupid (and it somehow is), but in log messages we + accidentially may have lines in the middle starting with + "Signed-off-by: foo" and thus matching our detection regex. Prevent + that.""" + + loglines=logmessage.split('\n') + i=len(loglines) + # from tail walk to top skipping empty lines + while i>=0: + i-=1 + if len(loglines[i].strip())==0: continue + break + if i>=0: + # walk further upwards to find first sob line, store in 'first' + first=None + while i>=0: + m=sob_re.match(loglines[i]) + if m==None: break + first=m + i-=1 + # if the last non-empty line matches our Signed-Off-by regex: extract username + if first!=None: + r=fixup_user(first.group(1),authors) + return r + return committer + +def export_file_contents(ctx,manifest,files): + count=0 + files.sort() + max=len(files) + for file in files: + fctx=ctx.filectx(file) + d=fctx.data() + wr('M %s inline %s' % (gitmode(manifest.execf(file)),file)) + wr('data %d' % len(d)) # had some trouble with size() + wr(d) + count+=1 + if count%cfg_export_boundary==0: + sys.stderr.write('Exported %d/%d files\n' % (count,max)) + if max>cfg_export_boundary: + sys.stderr.write('Exported %d/%d files\n' % (count,max)) + +def is_merge(parents): + c=0 + for parent in parents: + if parent>=0: + c+=1 + return c>1 + +def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): + (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) + parents=repo.changelog.parentrevs(revision) + + wr('commit refs/heads/%s' % branch) + wr('mark :%d' % (revision+1)) + if sob: + wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone)) + wr('committer %s %d %s' % (user,time,timezone)) + wr('data %d' % (len(desc)+1)) # wtf? + wr(desc) + wr() + + src=heads.get(branch,'') + link='' + if src!='': + # if we have a cached head, this is an incremental import: initialize it + # and kill reference so we won't init it again + wr('from %s' % src) + heads[branch]='' + sys.stderr.write('Initializing branch [%s] to parent [%s]\n' % + (branch,src)) + link=src # avoid making a merge commit for incremental import + elif link=='' and not heads.has_key(branch) and revision>0: + # newly created branch and not the first one: connect to parent + tmp=get_parent_mark(parents[0],marks) + wr('from %s' % tmp) + sys.stderr.write('Link new branch [%s] to parent [%s]\n' % + (branch,tmp)) + link=tmp # avoid making a merge commit for branch fork + + if parents: + l=last.get(branch,revision) + for p in parents: + # 1) as this commit implicitely is the child of the most recent + # commit of this branch, ignore this parent + # 2) ignore nonexistent parents + # 3) merge otherwise + if p==l or p==revision or p<0: + continue + tmp=get_parent_mark(p,marks) + # if we fork off a branch, don't merge with our parent via 'merge' + # as we have 'from' already above + if tmp==link: + continue + sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' % + (branch,tmp,p)) + wr('merge %s' % tmp) + + last[branch]=revision + heads[branch]='' + # we need this later to write out tags + marks[str(revision)]=':%d'%(revision+1) + + ctx=repo.changectx(str(revision)) + man=ctx.manifest() + added,changed,removed,type=[],[],[],'' + + if revision==0: + # first revision: feed in full manifest + added=man.keys() + type='full' + elif is_merge(parents): + # later merge revision: feed in changed manifest + # for many files comparing checksums is expensive so only do it for + # merges where we really need it due to hg's revlog logic + added,changed,removed=get_filechanges(repo,revision,parents,man) + type='thorough delta' + else: + # later non-merge revision: feed in changed manifest + # if we have exactly one parent, just take the changes from the + # manifest without expensively comparing checksums + f=repo.status(repo.lookup(parents[0]),revnode)[:3] + added,changed,removed=f[1],f[0],f[2] + type='simple delta' + + sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' % + (type,revision+1,max,len(added),len(changed),len(removed))) + + map(lambda r: wr('D %s' % r),removed) + export_file_contents(ctx,man,added+changed) + wr() + + return checkpoint(count) + +def export_tags(ui,repo,marks_cache,start,end,count,authors): + l=repo.tagslist() + for tag,node in l: + # ignore latest revision + if tag=='tip': continue + rev=repo.changelog.rev(node) + # ignore those tags not in our import range + if rev=end: continue + + ref=get_parent_mark(rev,marks_cache) + if ref==None: + sys.stderr.write('Failed to find reference for creating tag' + ' %s at r%d\n' % (tag,rev)) + continue + sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) + wr('reset refs/tags/%s' % tag) + wr('from %s' % ref) + wr() + count=checkpoint(count) + return count + +def load_authors(filename): + cache={} + if not os.path.exists(filename): + return cache + f=open(filename,'r') + l=0 + lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$') + for line in f.readlines(): + l+=1 + m=lre.match(line) + if m==None: + sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) + continue + # put key:value in cache, key without ^: + cache[m.group(1).strip()]=m.group(2).strip() + f.close() + sys.stderr.write('Loaded %d authors\n' % l) + return cache + +def load_cache(filename): + cache={} + if not os.path.exists(filename): + return cache + f=open(filename,'r') + l=0 + for line in f.readlines(): + l+=1 + fields=line.split(' ') + if fields==None or not len(fields)==2 or fields[0][0]!=':': + sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) + continue + # put key:value in cache, key without ^: + cache[fields[0][1:]]=fields[1].split('\n')[0] + f.close() + return cache + +def save_cache(filename,cache): + f=open(filename,'w+') + map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys()) + f.close() + +def verify_heads(ui,repo,cache,force): + def getsha1(branch): + try: + f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch) + sha1=f.readlines()[0].split('\n')[0] + f.close() + return sha1 + except IOError: + return None + + branches=repo.branchtags() + l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()] + l.sort() + + # get list of hg's branches to verify, don't take all git has + for _,_,b in l: + b=get_branch(b) + sha1=getsha1(b) + c=cache.get(b) + if sha1!=None and c!=None: + sys.stderr.write('Verifying branch [%s]\n' % b) + if sha1!=c: + sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:' + '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) + if not force: return False + + # verify that branch has exactly one head + t={} + for h in repo.heads(): + (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h) + if t.get(branch,False): + sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' % + repo.changelog.rev(h)) + if not force: return False + t[branch]=True + + return True + +def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False,force=False): + _max=int(m) + + marks_cache=load_cache(marksfile) + heads_cache=load_cache(headsfile) + state_cache=load_cache(tipfile) + + ui,repo=setup_repo(repourl) + + if not verify_heads(ui,repo,heads_cache,force): + return 1 + + tip=repo.changelog.count() + + min=int(state_cache.get('tip',0)) + max=_max + if _max<0: + max=tip + + c=0 + last={} + for rev in range(min,max): + c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,max,c,authors,sob) + + c=export_tags(ui,repo,marks_cache,min,max,c,authors) + + sys.stderr.write('Issued %d commands\n' % c) + + state_cache['tip']=max + state_cache['repo']=repourl + save_cache(tipfile,state_cache) + + return 0 + +if __name__=='__main__': + def bail(parser,opt): + sys.stderr.write('Error: No %s option given\n' % opt) + parser.print_help() + sys.exit(2) + + parser=OptionParser() + + parser.add_option("-m","--max",type="int",dest="max", + help="Maximum hg revision to import") + parser.add_option("--marks",dest="marksfile", + help="File to read git-fast-import's marks from") + parser.add_option("--heads",dest="headsfile", + help="File to read last run's git heads from") + parser.add_option("--status",dest="statusfile", + help="File to read status from") + parser.add_option("-r","--repo",dest="repourl", + help="URL of repo to import") + parser.add_option("-s",action="store_true",dest="sob", + default=False,help="Enable parsing Signed-off-by lines") + parser.add_option("-A","--authors",dest="authorfile", + help="Read authormap from AUTHORFILE") + parser.add_option("-f","--force",action="store_true",dest="force", + default=False,help="Ignore validation errors by force") + + (options,args)=parser.parse_args() + + m=-1 + if options.max!=None: m=options.max + + if options.marksfile==None: bail(parser,'--marks') + if options.marksfile==None: bail(parser,'--heads') + if options.marksfile==None: bail(parser,'--status') + if options.marksfile==None: bail(parser,'--repo') + + a={} + if options.authorfile!=None: + a=load_authors(options.authorfile) + + sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, + options.statusfile,authors=a,sob=options.sob,force=options.force)) diff --git a/hg-fast-export.sh b/hg-fast-export.sh new file mode 100755 index 0000000..5cbf934 --- /dev/null +++ b/hg-fast-export.sh @@ -0,0 +1,96 @@ +#!/bin/sh + +# Copyright (c) 2007 Rocco Rutte +# License: MIT + +ROOT="`dirname $0`" +REPO="" +PFX="hg2git" +SFX_MARKS="marks" +SFX_HEADS="heads" +SFX_STATE="state" +QUIET="" + +USAGE="[--quiet] [-r ] [-m ] [-s] [-A ]" +LONG_USAGE="Import hg repository up to either tip or +If is omitted, use last hg repository as obtained from state file, +GIT_DIR/$PFX-$SFX_STATE by default. + +Note: The argument order matters. + +Options: + -m Maximum revision to import + --quiet Passed to git-fast-import(1) + -s Enable parsing Signed-off-by lines + -A Read author map from file + (Same as in git-svnimport(1) and git-cvsimport(1)) + -r Mercurial repository to import +" + +. git-sh-setup +cd_to_toplevel + +while case "$#" in 0) break ;; esac +do + case "$1" in + -r|--r|--re|--rep|--repo) + shift + REPO="$1" + ;; + --q|--qu|--qui|--quie|--quiet) + QUIET="--quiet" + ;; + -*) + # pass any other options down to hg2git.py + break + ;; + *) + break + ;; + esac + shift +done + +# for convenience: get default repo from state file +if [ x"$REPO" = x -a -f "$GIT_DIR/$PFX-$SFX_STATE" ] ; then + REPO="`egrep '^:repo ' "$GIT_DIR/$PFX-$SFX_STATE" | cut -d ' ' -f 2`" + echo "Using last hg repository \"$REPO\"" +fi + +# make sure we have a marks cache +if [ ! -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then + touch "$GIT_DIR/$PFX-$SFX_MARKS" +fi + +GIT_DIR="$GIT_DIR" python "$ROOT/hg2git.py" \ + --repo "$REPO" \ + --marks "$GIT_DIR/$PFX-$SFX_MARKS" \ + --heads "$GIT_DIR/$PFX-$SFX_HEADS" \ + --status "$GIT_DIR/$PFX-$SFX_STATE" \ + "$@" \ +| git-fast-import $QUIET --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ +|| die 'Git fast-import failed' + +# move recent marks cache out of the way... +if [ -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then + mv "$GIT_DIR/$PFX-$SFX_MARKS" "$GIT_DIR/$PFX-$SFX_MARKS.old" +else + touch "$GIT_DIR/$PFX-$SFX_MARKS.old" +fi + +# ...to create a new merged one +cat "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ +| uniq > "$GIT_DIR/$PFX-$SFX_MARKS" + +# cleanup +rm -rf "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" + +# save SHA1s of current heads for incremental imports +# and connectivity (plus sanity checking) +for head in `git branch | sed 's#^..##'` ; do + id="`git-rev-parse $head`" + echo ":$head $id" +done > "$GIT_DIR/$PFX-$SFX_HEADS" + +# check diff with color: +# ( for i in `find . -type f | grep -v '\.git'` ; do diff -u $i $REPO/$i ; done | cdiff ) | less -r diff --git a/hg-fast-export.txt b/hg-fast-export.txt new file mode 100644 index 0000000..fdd8b9f --- /dev/null +++ b/hg-fast-export.txt @@ -0,0 +1,62 @@ +hg2git.(sh|py) - mercurial to git converter using git-fast-import + +Legal +===== + +The scripts are licensed under the MIT license[0] and were written by +Rocco Rutte with hints and help from the git list and +#mercurial on freenode. + +Usage +===== + +Using it is quite simple for a mercurial repository : + + mkdir repo-git # or whatever + cd repo-git + git init + hg2git.sh -r + +Incremental imports to track hg repos is supported, too. + +Notes/Limitations +================= + +hg2git supports multiple branches but only named branches with exaclty +one head each. Otherwise commits to the tip of these heads within branch +will get flattened into merge commits. + +As each git-fast-import run creates a new pack file, it may be required +to repack the repository quite often for incremental imports (especially +when importing a small number of changesets per incremental import). + +Design +====== + +hg2git.py was designed in a way that doesn't require a 2-pass mechanism +or any prior repository analysis: if just feeds what it finds into +git-fast-import. This also implies that it heavily relies on strictly +linear ordering of changesets from hg, i.e. its append-only storage +model so that changesets hg2git already saw never get modified. + +Todo +==== + +For incremental imports, handling tags needs to be reworked (maybe): +Right now we assume that once a tag is created, it stays forever and +never changes. However, + + 1) tags in hg may be removed + 2) tags may change + +I'm not yet sure how to handle this and how this interferes with +non-hg-based tags in git. + +The same for branches: They may get removed. + +For one-time conversions, everything is fine. + +Footnotes +========= + +[0] http://www.opensource.org/licenses/mit-license.php diff --git a/hg2git.py b/hg2git.py deleted file mode 100644 index eb927a4..0000000 --- a/hg2git.py +++ /dev/null @@ -1,450 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2007 Rocco Rutte -# License: GPLv2 - -"""hg2git.py - A mercurial-to-git filter for git-fast-import(1) -Usage: hg2git.py -""" - -from mercurial import repo,hg,cmdutil,util,ui,revlog,node -from tempfile import mkstemp -from optparse import OptionParser -import re -import sys -import os - -# silly regex to catch Signed-off-by lines in log message -sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$') -# silly regex to see if user field has email address -user_re=re.compile('([^<]+) (<[^>]+>)$') -# silly regex to clean out user names -user_clean_re=re.compile('^["]([^"]+)["]$') -# git branch for hg's default 'HEAD' branch -cfg_master='master' -# insert 'checkpoint' command after this many commits or none at all if 0 -cfg_checkpoint_count=0 -# write some progress message every this many file contents written -cfg_export_boundary=1000 - -def usage(ret): - sys.stderr.write(__doc__) - return ret - -def setup_repo(url): - myui=ui.ui() - return myui,hg.repository(myui,url) - -def fixup_user(user,authors): - if authors!=None: - # if we have an authors table, try to get mapping - # by defaulting to the current value of 'user' - user=authors.get(user,user) - name,mail,m='','',user_re.match(user) - if m==None: - # if we don't have 'Name ' syntax, use 'user - # ' if use contains no at and - # 'user ' otherwise - name=user - if '@' not in user: - mail='' - else: - mail='<%s>' % user - else: - # if we have 'Name ' syntax, everything is fine :) - name,mail=m.group(1),m.group(2) - - # remove any silly quoting from username - m2=user_clean_re.match(name) - if m2!=None: - name=m2.group(1) - return '%s %s' % (name,mail) - -def get_branch(name): - if name=='HEAD': - name=cfg_master - return name - -def get_changeset(ui,repo,revision,authors={}): - node=repo.lookup(revision) - (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node) - tz="%+03d%02d" % (-timezone / 3600, ((-timezone % 3600) / 60)) - branch=get_branch(extra.get('branch','master')) - return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra) - -def gitmode(x): - return x and '100755' or '100644' - -def wr(msg=''): - print msg - #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n')) - -def checkpoint(count): - count=count+1 - if cfg_checkpoint_count>0 and count%cfg_checkpoint_count==0: - sys.stderr.write("Checkpoint after %d commits\n" % count) - wr('checkpoint') - wr() - return count - -def get_parent_mark(parent,marks): - """Get the mark for some parent. - If we saw it in the current session, return :%d syntax and - otherwise the SHA1 from the cache.""" - return marks.get(str(parent+1),':%d' % (parent+1)) - -def mismatch(f1,f2): - """See if two revisions of a file are not equal.""" - return node.hex(f1)!=node.hex(f2) - -def outer_set(dleft,dright,l,c,r): - """Loop over our repository and find all changed and missing files.""" - for left in dleft.keys(): - right=dright.get(left,None) - if right==None: - # we have the file but our parent hasn't: add to left set - l.append(left) - elif mismatch(dleft[left],right): - # we have it but checksums mismatch: add to center set - c.append(left) - for right in dright.keys(): - left=dleft.get(right,None) - if left==None: - # if parent has file but we don't: add to right set - r.append(right) - # change is already handled when comparing child against parent - return l,c,r - -def get_filechanges(repo,revision,parents,mleft): - """Given some repository and revision, find all changed/deleted files.""" - l,c,r=[],[],[] - for p in parents: - if p<0: continue - mright=repo.changectx(p).manifest() - dleft=mleft.keys() - dleft.sort() - dright=mright.keys() - dright.sort() - l,c,r=outer_set(mleft,mright,l,c,r) - return l,c,r - -def get_author(logmessage,committer,authors): - """As git distincts between author and committer of a patch, try to - extract author by detecting Signed-off-by lines. - - This walks from the end of the log message towards the top skipping - empty lines. Upon the first non-empty line, it walks all Signed-off-by - lines upwards to find the first one. For that (if found), it extracts - authorship information the usual way (authors table, cleaning, etc.) - - If no Signed-off-by line is found, this defaults to the committer. - - This may sound stupid (and it somehow is), but in log messages we - accidentially may have lines in the middle starting with - "Signed-off-by: foo" and thus matching our detection regex. Prevent - that.""" - - loglines=logmessage.split('\n') - i=len(loglines) - # from tail walk to top skipping empty lines - while i>=0: - i-=1 - if len(loglines[i].strip())==0: continue - break - if i>=0: - # walk further upwards to find first sob line, store in 'first' - first=None - while i>=0: - m=sob_re.match(loglines[i]) - if m==None: break - first=m - i-=1 - # if the last non-empty line matches our Signed-Off-by regex: extract username - if first!=None: - r=fixup_user(first.group(1),authors) - return r - return committer - -def export_file_contents(ctx,manifest,files): - count=0 - files.sort() - max=len(files) - for file in files: - fctx=ctx.filectx(file) - d=fctx.data() - wr('M %s inline %s' % (gitmode(manifest.execf(file)),file)) - wr('data %d' % len(d)) # had some trouble with size() - wr(d) - count+=1 - if count%cfg_export_boundary==0: - sys.stderr.write('Exported %d/%d files\n' % (count,max)) - if max>cfg_export_boundary: - sys.stderr.write('Exported %d/%d files\n' % (count,max)) - -def is_merge(parents): - c=0 - for parent in parents: - if parent>=0: - c+=1 - return c>1 - -def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob): - (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors) - parents=repo.changelog.parentrevs(revision) - - wr('commit refs/heads/%s' % branch) - wr('mark :%d' % (revision+1)) - if sob: - wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone)) - wr('committer %s %d %s' % (user,time,timezone)) - wr('data %d' % (len(desc)+1)) # wtf? - wr(desc) - wr() - - src=heads.get(branch,'') - link='' - if src!='': - # if we have a cached head, this is an incremental import: initialize it - # and kill reference so we won't init it again - wr('from %s' % src) - heads[branch]='' - sys.stderr.write('Initializing branch [%s] to parent [%s]\n' % - (branch,src)) - link=src # avoid making a merge commit for incremental import - elif link=='' and not heads.has_key(branch) and revision>0: - # newly created branch and not the first one: connect to parent - tmp=get_parent_mark(parents[0],marks) - wr('from %s' % tmp) - sys.stderr.write('Link new branch [%s] to parent [%s]\n' % - (branch,tmp)) - link=tmp # avoid making a merge commit for branch fork - - if parents: - l=last.get(branch,revision) - for p in parents: - # 1) as this commit implicitely is the child of the most recent - # commit of this branch, ignore this parent - # 2) ignore nonexistent parents - # 3) merge otherwise - if p==l or p==revision or p<0: - continue - tmp=get_parent_mark(p,marks) - # if we fork off a branch, don't merge with our parent via 'merge' - # as we have 'from' already above - if tmp==link: - continue - sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' % - (branch,tmp,p)) - wr('merge %s' % tmp) - - last[branch]=revision - heads[branch]='' - # we need this later to write out tags - marks[str(revision)]=':%d'%(revision+1) - - ctx=repo.changectx(str(revision)) - man=ctx.manifest() - added,changed,removed,type=[],[],[],'' - - if revision==0: - # first revision: feed in full manifest - added=man.keys() - type='full' - elif is_merge(parents): - # later merge revision: feed in changed manifest - # for many files comparing checksums is expensive so only do it for - # merges where we really need it due to hg's revlog logic - added,changed,removed=get_filechanges(repo,revision,parents,man) - type='thorough delta' - else: - # later non-merge revision: feed in changed manifest - # if we have exactly one parent, just take the changes from the - # manifest without expensively comparing checksums - f=repo.status(repo.lookup(parents[0]),revnode)[:3] - added,changed,removed=f[1],f[0],f[2] - type='simple delta' - - sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' % - (type,revision+1,max,len(added),len(changed),len(removed))) - - map(lambda r: wr('D %s' % r),removed) - export_file_contents(ctx,man,added+changed) - wr() - - return checkpoint(count) - -def export_tags(ui,repo,marks_cache,start,end,count,authors): - l=repo.tagslist() - for tag,node in l: - # ignore latest revision - if tag=='tip': continue - rev=repo.changelog.rev(node) - # ignore those tags not in our import range - if rev=end: continue - - ref=get_parent_mark(rev,marks_cache) - if ref==None: - sys.stderr.write('Failed to find reference for creating tag' - ' %s at r%d\n' % (tag,rev)) - continue - sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) - wr('reset refs/tags/%s' % tag) - wr('from %s' % ref) - wr() - count=checkpoint(count) - return count - -def load_authors(filename): - cache={} - if not os.path.exists(filename): - return cache - f=open(filename,'r') - l=0 - lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$') - for line in f.readlines(): - l+=1 - m=lre.match(line) - if m==None: - sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) - continue - # put key:value in cache, key without ^: - cache[m.group(1).strip()]=m.group(2).strip() - f.close() - sys.stderr.write('Loaded %d authors\n' % l) - return cache - -def load_cache(filename): - cache={} - if not os.path.exists(filename): - return cache - f=open(filename,'r') - l=0 - for line in f.readlines(): - l+=1 - fields=line.split(' ') - if fields==None or not len(fields)==2 or fields[0][0]!=':': - sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,l)) - continue - # put key:value in cache, key without ^: - cache[fields[0][1:]]=fields[1].split('\n')[0] - f.close() - return cache - -def save_cache(filename,cache): - f=open(filename,'w+') - map(lambda x: f.write(':%s %s\n' % (str(x),str(cache.get(x)))),cache.keys()) - f.close() - -def verify_heads(ui,repo,cache,force): - def getsha1(branch): - try: - f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch) - sha1=f.readlines()[0].split('\n')[0] - f.close() - return sha1 - except IOError: - return None - - branches=repo.branchtags() - l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()] - l.sort() - - # get list of hg's branches to verify, don't take all git has - for _,_,b in l: - b=get_branch(b) - sha1=getsha1(b) - c=cache.get(b) - if sha1!=None and c!=None: - sys.stderr.write('Verifying branch [%s]\n' % b) - if sha1!=c: - sys.stderr.write('Error: Branch [%s] modified outside hg2git:' - '\n%s (repo) != %s (cache)\n' % (b,sha1,c)) - if not force: return False - - # verify that branch has exactly one head - t={} - for h in repo.heads(): - (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h) - if t.get(branch,False): - sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' % - repo.changelog.rev(h)) - if not force: return False - t[branch]=True - - return True - -def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False,force=False): - _max=int(m) - - marks_cache=load_cache(marksfile) - heads_cache=load_cache(headsfile) - state_cache=load_cache(tipfile) - - ui,repo=setup_repo(repourl) - - if not verify_heads(ui,repo,heads_cache,force): - return 1 - - tip=repo.changelog.count() - - min=int(state_cache.get('tip',0)) - max=_max - if _max<0: - max=tip - - c=0 - last={} - for rev in range(min,max): - c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,max,c,authors,sob) - - c=export_tags(ui,repo,marks_cache,min,max,c,authors) - - sys.stderr.write('Issued %d commands\n' % c) - - state_cache['tip']=max - state_cache['repo']=repourl - save_cache(tipfile,state_cache) - - return 0 - -if __name__=='__main__': - def bail(parser,opt): - sys.stderr.write('Error: No %s option given\n' % opt) - parser.print_help() - sys.exit(2) - - parser=OptionParser() - - parser.add_option("-m","--max",type="int",dest="max", - help="Maximum hg revision to import") - parser.add_option("--marks",dest="marksfile", - help="File to read git-fast-import's marks from") - parser.add_option("--heads",dest="headsfile", - help="File to read last run's git heads from") - parser.add_option("--status",dest="statusfile", - help="File to read status from") - parser.add_option("-r","--repo",dest="repourl", - help="URL of repo to import") - parser.add_option("-s",action="store_true",dest="sob", - default=False,help="Enable parsing Signed-off-by lines") - parser.add_option("-A","--authors",dest="authorfile", - help="Read authormap from AUTHORFILE") - parser.add_option("-f","--force",action="store_true",dest="force", - default=False,help="Ignore validation errors by force") - - (options,args)=parser.parse_args() - - m=-1 - if options.max!=None: m=options.max - - if options.marksfile==None: bail(parser,'--marks') - if options.marksfile==None: bail(parser,'--heads') - if options.marksfile==None: bail(parser,'--status') - if options.marksfile==None: bail(parser,'--repo') - - a={} - if options.authorfile!=None: - a=load_authors(options.authorfile) - - sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile, - options.statusfile,authors=a,sob=options.sob,force=options.force)) diff --git a/hg2git.sh b/hg2git.sh deleted file mode 100755 index 8bd96f4..0000000 --- a/hg2git.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/sh - -ROOT="`dirname $0`" -REPO="" -PFX="hg2git" -SFX_MARKS="marks" -SFX_HEADS="heads" -SFX_STATE="state" -QUIET="" - -USAGE="[--quiet] [-r ] [-m ] [-s] [-A ]" -LONG_USAGE="Import hg repository up to either tip or -If is omitted, use last hg repository as obtained from state file, -GIT_DIR/$PFX-$SFX_STATE by default. - -Note: The argument order matters. - -Options: - -m Maximum revision to import - --quiet Passed to git-fast-import(1) - -s Enable parsing Signed-off-by lines - -A Read author map from file - (Same as in git-svnimport(1) and git-cvsimport(1)) - -r Mercurial repository to import -" - -. git-sh-setup -cd_to_toplevel - -while case "$#" in 0) break ;; esac -do - case "$1" in - -r|--r|--re|--rep|--repo) - shift - REPO="$1" - ;; - --q|--qu|--qui|--quie|--quiet) - QUIET="--quiet" - ;; - -*) - # pass any other options down to hg2git.py - break - ;; - *) - break - ;; - esac - shift -done - -# for convenience: get default repo from state file -if [ x"$REPO" = x -a -f "$GIT_DIR/$PFX-$SFX_STATE" ] ; then - REPO="`egrep '^:repo ' "$GIT_DIR/$PFX-$SFX_STATE" | cut -d ' ' -f 2`" - echo "Using last hg repository \"$REPO\"" -fi - -# make sure we have a marks cache -if [ ! -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then - touch "$GIT_DIR/$PFX-$SFX_MARKS" -fi - -GIT_DIR="$GIT_DIR" python "$ROOT/hg2git.py" \ - --repo "$REPO" \ - --marks "$GIT_DIR/$PFX-$SFX_MARKS" \ - --heads "$GIT_DIR/$PFX-$SFX_HEADS" \ - --status "$GIT_DIR/$PFX-$SFX_STATE" \ - "$@" \ -| git-fast-import $QUIET --export-marks="$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ -|| die 'Git fast-import failed' - -# move recent marks cache out of the way... -if [ -f "$GIT_DIR/$PFX-$SFX_MARKS" ] ; then - mv "$GIT_DIR/$PFX-$SFX_MARKS" "$GIT_DIR/$PFX-$SFX_MARKS.old" -else - touch "$GIT_DIR/$PFX-$SFX_MARKS.old" -fi - -# ...to create a new merged one -cat "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" \ -| uniq > "$GIT_DIR/$PFX-$SFX_MARKS" - -# cleanup -rm -rf "$GIT_DIR/$PFX-$SFX_MARKS.old" "$GIT_DIR/$PFX-$SFX_MARKS.tmp" - -# save SHA1s of current heads for incremental imports -# and connectivity (plus sanity checking) -for head in `git branch | sed 's#^..##'` ; do - id="`git-rev-parse $head`" - echo ":$head $id" -done > "$GIT_DIR/$PFX-$SFX_HEADS" - -# check diff with color: -# ( for i in `find . -type f | grep -v '\.git'` ; do diff -u $i $REPO/$i ; done | cdiff ) | less -r diff --git a/hg2git.txt b/hg2git.txt deleted file mode 100644 index f4da258..0000000 --- a/hg2git.txt +++ /dev/null @@ -1,57 +0,0 @@ -hg2git.(sh|py) - mercurial to git converter using git-fast-import - -Legal -===== - -The scripts are licensed under the GPL version 2 and were written by -Rocco Rutte with hints and help from the git list and -#mercurial on freenode. - -Usage -===== - -Using it is quite simple for a mercurial repository : - - mkdir repo-git # or whatever - cd repo-git - git init - hg2git.sh -r - -Incremental imports to track hg repos is supported, too. - -Notes/Limitations -================= - -hg2git supports multiple branches but only named branches with exaclty -one head each. Otherwise commits to the tip of these heads within branch -will get flattened into merge commits. - -As each git-fast-import run creates a new pack file, it may be required -to repack the repository quite often for incremental imports (especially -when importing a small number of changesets per incremental import). - -Design -====== - -hg2git.py was designed in a way that doesn't require a 2-pass mechanism -or any prior repository analysis: if just feeds what it finds into -git-fast-import. This also implies that it heavily relies on strictly -linear ordering of changesets from hg, i.e. its append-only storage -model so that changesets hg2git already saw never get modified. - -Todo -==== - -For incremental imports, handling tags needs to be reworked (maybe): -Right now we assume that once a tag is created, it stays forever and -never changes. However, - - 1) tags in hg may be removed - 2) tags may change - -I'm not yet sure how to handle this and how this interferes with -non-hg-based tags in git. - -The same for branches: They may get removed. - -For one-time conversions, everything is fine.